Example #1
def _planAhead ( buf ):

    """
    check for possible problems in the next scan while context
    is still available and set flags if needed

    arguments:
        buf  - buffer to be scanned
    """

    global _toscan

    nsk = 0                     # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':       # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':       # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk

    nix = 0                    # scan count
    if lb > 8:
        for chx in buf:        # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break      # must be space
                return
            nix += 1

        sst = ''.join(buf[:nix]).lower()
        if not sst in _det:
            return            # must find determiner

        nix += 1              # skip space
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1          # skip first letter
            buf = buf[nix:]
            for ch in buf:    # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1

            _toscan = lb + nsk - nix
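
To make the look-ahead above easier to trace, here is a minimal self-contained sketch of the same idea; the determiner set and the plain str.isalpha/isspace/isupper tests are stand-ins for PyElly's ellyChar predicates, not the library's own code:

_DET = {'a', 'an', 'the'}            # stand-in determiner set (assumption)

def plan_ahead(text):
    # skip a leading '(' or '"', require "<determiner> <Capitalized word>",
    # and return how many chars would remain to scan, else None
    buf = list(text)
    nsk = 0
    for mark in ('(', '"'):          # skip initial '(' and/or '"'
        if buf and buf[0] == mark:
            nsk += 1
            buf = buf[1:]
    nix = 0
    while nix < len(buf) and buf[nix].isalpha():
        nix += 1                     # go to first non-letter
    if nix == len(buf) or not buf[nix].isspace():
        return None                  # non-letter must be a space
    if ''.join(buf[:nix]).lower() not in _DET:
        return None                  # must find a determiner
    nix += 1                         # skip the space
    if nix < len(buf) and buf[nix].isupper():
        return len(buf) + nsk - nix  # chars left to scan after the determiner
    return None

print(plan_ahead('the Quick brown fox'))   # -> 15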
Example #2
def _extract ( buf , nch ):

    """
    get next possible name components at current position

    arguments:
        buf  - current contents as list of chars
        nch  - char count to work with

    returns:
        component string if found, otherwise ''
    """

    if nch == 0: return ''
    lrw = buf[:nch]      # list of chars in possible component
#   print ( 'lrw=' , lrw )
    if lrw[0] == ',':
        if nch == 1 or not ellyChar.isWhiteSpace(lrw[1]): return ''
        lrw.pop(0)
        lrw.pop(0)

#   print ( 'lrw=' , lrw )
    if lrw[0] == '(':
        lrw.pop()        # remove any pair of parentheses before lookup
        lrw.pop(0)
#   print ( 'lrw=' , lrw )
    if len(lrw) > 2 and lrw[0] == '"' and lrw[-1] == '"':
        lrw.pop()        # remove any pair of double quotes before lookup
        lrw.pop(0)
    if len(lrw) > 0 and lrw[-1] == ',':
        lrw.pop()
#   print ( 'lrw=' , lrw )

    return ''.join(lrw) # possible name component as string
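
For illustration, a simplified stand-alone version of the same trimming steps; plain str methods replace ellyChar.isWhiteSpace, and the sample input is an assumption:

def extract_component(buf, nch):
    # strip a leading ", ", an enclosing '(...)' or '"..."', and a
    # trailing ',' before returning the candidate component as a string
    if nch == 0:
        return ''
    lrw = list(buf[:nch])
    if lrw[0] == ',':
        if nch == 1 or not lrw[1].isspace():
            return ''
        lrw = lrw[2:]
    if lrw and lrw[0] == '(':
        lrw = lrw[1:-1]
    if len(lrw) > 2 and lrw[0] == '"' and lrw[-1] == '"':
        lrw = lrw[1:-1]
    if lrw and lrw[-1] == ',':
        lrw = lrw[:-1]
    return ''.join(lrw)

print(extract_component(list(', "Smith"'), 9))   # -> Smith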
Example #3
def normalize ( s ):

    """
    convert all unrecognizable input chars to _ and any
    consecutive white spaces to a single space

    arguments:
        s   - Unicode string or char list to operate on
    returns:
        normalized sequence
    """

    spaced = False
    n = len(s)
    ns = [ ]
    for i in range(n):
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced: continue
            x = ' '
            spaced = True
        elif not ellyChar.isText(x):
            x = '_'
            spaced = False
        else:
            spaced = False
        ns.append(x)
    return ns
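
A short usage sketch of the same collapsing behavior, with plain str predicates standing in for ellyChar (the stand-ins and the sample string are assumptions):

def collapse(s):
    # squeeze runs of white space down to a single ' ' and map
    # non-printable chars to '_', mirroring normalize() above
    out, spaced = [], False
    for x in s:
        if x.isspace():
            if spaced:
                continue
            x, spaced = ' ', True
        else:
            spaced = False
            if not x.isprintable():
                x = '_'
        out.append(x)
    return out

print(''.join(collapse(list('a \t b\n\nc'))))   # -> "a b c"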
Example #4
    def findClose ( self , opn , cls ):

        """
        look ahead for closing bracket in input stream buffer

        arguments:
            self  -
            opn   - opening bracket
            cls   - closing bracket to look for

        returns:
            offset in stream if found, -1 otherwise
        """

        skp = 0    # skip count
        nos = 0    # offset in buffer
        nlm = len(self.buf)
        if nlm > NLM: nlm = NLM  # set lookahead limit
        while nos < nlm:
            if   self.buf[nos] == opn:  # another opening bracket means
                skp += 1                # to skip a closing one
            elif self.buf[nos] == cls:
                if skp > 0:             # check for skip
                    skp -= 1
                elif nos + 1 == nlm or ellyChar.isWhiteSpace(self.buf[nos+1]):
                    return nos          # offset for closure
            nos += 1
        return -1
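
A minimal free-standing sketch of the same nesting-aware scan; the buffer, the lookahead limit, and the str.isspace test are assumptions for illustration:

def find_close(buf, opn, cls, limit=100):
    # scan a char list for the closing bracket, skipping nested pairs and
    # requiring white space (or end of scan) right after the match
    skp = 0
    nlm = min(len(buf), limit)
    for nos in range(nlm):
        if buf[nos] == opn:
            skp += 1
        elif buf[nos] == cls:
            if skp > 0:
                skp -= 1
            elif nos + 1 == nlm or buf[nos + 1].isspace():
                return nos
    return -1

print(find_close(list('a (b) c) next'), '(', ')'))   # -> 7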
Example #5
    def normalize(self, s):
        """
        convert all unrecognizable input chars to _ and any
        consecutive white spaces to a single space

        arguments:
            self -
            s    - Unicode string or char list to operate on
        returns:
            normalized sequence
        """

        #       print ( '__ normalize' )
        spaced = False
        n = len(s)
        ns = []
        for i in range(n):
            x = s[i]
            if ellyChar.isLetter(x):
                spaced = False
            elif ellyChar.isWhiteSpace(x):
                if spaced: continue
                x = ' '
                spaced = True
            elif not ellyChar.isText(x):
                x = '_'
                spaced = False
            else:
                spaced = False
            ns.append(x)
        return ns
Example #6
    def findClose(self, opn, cls):
        """
        look ahead for closing bracket in input stream buffer

        arguments:
            self  -
            opn   - opening bracket
            cls   - closing bracket to look for

        returns:
            offset in stream if found, -1 otherwise
        """

        skp = 0  # skip count
        nos = 0  # offset in buffer
        nlm = len(self.buf)
        if nlm > NLM: nlm = NLM  # set lookahead limit
        while nos < nlm:
            if self.buf[nos] == opn:  # another opening bracket means
                skp += 1  # to skip a closing one
            elif self.buf[nos] == cls:
                if skp > 0:  # check for skip
                    skp -= 1
                elif nos + 1 == nlm or ellyChar.isWhiteSpace(
                        self.buf[nos + 1]):
                    return nos  # offset for closure
            nos += 1
        return -1
Example #7
def _extract ( buf , nch ):

    """
    get next possible name components at current position

    arguments:
        buf  - current contents as list of chars
        nch  - char count to work with

    returns:
        component string if found, otherwise ''
    """

    if nch == 0: return ''
    lrw = buf[:nch]      # list of chars in possible component
#   print 'lrw=' , lrw
    if lrw[0] == ',':
        if nch == 1 or not ellyChar.isWhiteSpace(lrw[1]): return ''
        lrw.pop(0)
        lrw.pop(0)

#   print 'lrw=' , lrw
    if lrw[0] == '(':
        lrw.pop()        # remove any pair of parentheses before lookup
        lrw.pop(0)
#   print 'lrw=' , lrw
    if len(lrw) > 2 and lrw[0] == '"' and lrw[-1] == '"':
        lrw.pop()        # remove any pair of double quotes before lookup
        lrw.pop(0)
    if len(lrw) > 0 and lrw[-1] == ',':
        lrw.pop()
#   print 'lrw=' , lrw

    return u''.join(lrw) # possible name component as string
Example #8
def normalize ( s ):

    """
    convert all non-ASCII nonalphanumeric chars in the sequence to _ and
    consecutive white spaces to a single space char

    arguments:
        s   - input sequence to operate on
    """

    spaced = False
    k = 0
    n = len(s)
    for i in range(n):
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced: continue
            x = ' '
            spaced = True
        elif ord(x) > 127:
            x = '_'
            spaced = False
        else:
            spaced = False
        s[k] = x
        k += 1
    s = s[:k]
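
Note that the final s = s[:k] rebinds only the local name and does not shorten the caller's list; if in-place truncation is wanted, a slice deletion does it, as in this small sketch:

def truncate_in_place(seq, k):
    # shrink the list object itself so the caller sees the shorter sequence
    del seq[k:]

chars = list('ab   ')
truncate_in_place(chars, 2)
print(chars)   # -> ['a', 'b']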
Example #9
    def normalize(self, s):
        """
        overrides method in parent class to convert all letters to _
        and to eliminate any white space

        arguments:
            self -
            s    - Unicode string or char list to operate on
        returns:
            normalized sequence
        """

        #       print 'ZH normalize'
        n = len(s)
        ns = []
        for i in range(n):
            x = s[i]
            #           print '     x=' , x
            if ellyChar.isLetter(x):
                x = '_'
            elif ellyChar.isWhiteSpace(x):
                continue
#           print 'norm x=' , x
            ns.append(x)
#       print 'norm=' , ns
        return ns
Example #10
def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    #   print ( 'nomatch() ln=' , ln , txt )
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
#   print ( 'check' , txt[-3:] )
    if not txt[-1] in ['M', 'm'] or txt[-2] != '.' or not txt[-3] in [
            'P', 'p', 'A', 'a'
    ] or txt[-4] != ' ':
        return False
    ch = txt[-5]
    #   print ( 'ch=' , ch )
    if ellyChar.isDigit(ch):  # only 1 digit will be checked here!
        #       print ( 'ONE DIGIT' )
        return True  # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

#   print ( 'nn=' , nn )
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [' ', '-']:
            return False
    wd = ''.join(txt[:-nn]).lower()

    #   print ( 'wd=' , wd )
    if wd in [
            'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            'nine', 'ten', 'eleven', 'twelve'
    ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
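
The core of the check, reduced to the digit-before-"A.M."/"P.M." case, can be sketched on its own; the str-based predicates and the sample text are assumptions:

def is_am_pm_period(txt, pnc, ctx):
    # True if pnc is the final '.' of "A.M."/"P.M." preceded by a digit
    # and followed by white space, i.e. the sentence should not break here
    nxt = ctx[0] if ctx else ''
    if pnc != '.' or not nxt.isspace() or len(txt) < 5:
        return False
    return (txt[-1] in 'Mm' and txt[-2] == '.' and
            txt[-3] in 'PpAa' and txt[-4] == ' ' and txt[-5].isdigit())

print(is_am_pm_period(list('at 9 A.M'), '.', list(' on Friday')))   # -> True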
Example #11
def spc(c):
    """
    special space check
    arguments:
        c  - single char
    returns:
        True if white space or null, False otherwise
    """
    return c == '' or ellyChar.isWhiteSpace(c)
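
A tiny usage sketch with a local stand-in for ellyChar.isWhiteSpace (the stub is an assumption, not PyElly's implementation):

def _is_white_space(c):
    # stand-in: treat the usual ASCII white-space chars as space
    return c in ' \t\n\r'

def spc(c):
    # True for white space or for the null string used to mean "no char"
    return c == '' or _is_white_space(c)

print(spc(''), spc(' '), spc('a'))   # -> True True False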
Example #12
def spc ( c ):

    """
    special space check
    arguments:
        c  - single char
    returns:
        True if white space or null, False otherwise
    """
    return c == '' or ellyChar.isWhiteSpace(c)
Example #13
def matchtoo ( txt , pnc , ctx ):

    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
#   print 'nomatch() ln=' , ln , txt
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
#   print 'check' , txt[-3:]
    if not txt[-1] in ['M','m'] or txt[-2] != '.' or not txt[-3] in ['P','p','A','a'] or txt[-4] != ' ':
        return False
    ch = txt[-5]
#   print 'ch=' , ch
    if ellyChar.isDigit(ch):        # only 1 digit will be checked here!
#       print 'ONE DIGIT'
        return True                 # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

#   print 'nn=' , nn
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [ ' ' , '-' ]:
            return False
    wd = ''.join(txt[:-nn]).lower()

#   print 'wd=' , wd
    if wd in [ 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
               'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
Example #14
    def _reload ( self ):

        """
        refill input line buffer and compute indentation

        arguments:
            self

        returns:
            True on success if buffer has at least one char, False otherwise
        """

#       print '_reload'
        if len(self.buf) > 0:
            return True                  # no refilling needed

        if self._eof:
            return False                 # must return immediately on previous EOF

        while len(self.buf) == 0:

#           print 'get more text'

            try:
                if self._prmpt: sys.stdout.write('>> ')
                s = self.inp.readline()  # new text line to add
                if len(s) == 0:
#                   print '**EOF'
                    self._eof = True
                    return False         # EOF
                s = s.decode('utf8')     # to tell Python how to interpret input string
#               print 'raw s=' , s
            except IOError:
                print >> sys.stderr , '** char stream ERROR'
                return False             # treat read failure as empty line

            k = 0
            while k < len(s):            # count leading white space chars
                if s[k] == NL: break     # but stop at end of line
                if not ellyChar.isWhiteSpace(s[k]): break
                k += 1
            self._in = k                 # save indentation level
            s = s[k:]
            self.buf = list(s)           # put unindented text into buffer
#           print 'k=' , k , ', s=' , '"' + s + '"'
#           print self.buf
            if k > 0 and ellyConfiguration.noteIndentation:
                self.buf.insert(0,NL)    # if noted, indentation will break sentence

#           print 'len=' , len(self.buf)
            if len(self.buf) > 0:        # if no usable input, stop
                return True

        return False
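
The indentation-counting step can be isolated into a small sketch; NL and the white-space test below are local stand-ins:

NL = '\n'

def count_indent(s):
    # count leading white-space chars on a line, stopping at end of line
    k = 0
    while k < len(s):
        if s[k] == NL or not s[k].isspace():
            break
        k += 1
    return k

line = '    indented text\n'
k = count_indent(line)
print(k, repr(line[k:]))   # -> 4 'indented text\n'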
Example #15
    def append(self, text):
        """
        add chars to end of buffer

        arguments:
            self  -
            text  - text to append, string or list of chars
        """

        if not isinstance(text, list):
            text = list(text)  # get new text as list if not already
        if len(self.buffer) > 0:
            if not ellyChar.isWhiteSpace(self.buffer[-1]) and text[0] != ' ':
                self.buffer.append(' ')  # put in space separator if needed
        self.buffer.extend(text)  # add new text
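
A reduced sketch of the same buffer behavior, kept self-contained with a plain isspace() test; the class name and sample calls are assumptions:

class CharBuffer:
    # minimal buffer that inserts a space between runs of appended text
    def __init__(self):
        self.buffer = []

    def append(self, text):
        if not isinstance(text, list):
            text = list(text)
        if self.buffer and text:
            if not self.buffer[-1].isspace() and text[0] != ' ':
                self.buffer.append(' ')   # space separator if needed
        self.buffer.extend(text)

b = CharBuffer()
b.append('abc')
b.append('def')
print(''.join(b.buffer))   # -> "abc def"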
Example #16
    def atSpace(self):
        """
        look for space char at start of buffer

        arguments:
            self

        returns:
            True if found, False otherwise
        """

        if len(self.buffer) == 0:
            return False
        else:
            return ellyChar.isWhiteSpace(self.buffer[0])
Example #17
    def append ( self , text ):

        """
        add chars to end of buffer

        arguments:
            self  -
            text  - text to append, string or list of chars
        """

        if type(text) != list:          # get new text as list
            text = list(text)
        if len(self.buffer) > 0:
            if not ellyChar.isWhiteSpace(self.buffer[-1]) and text[0] != ' ':
                self.buffer.append(' ') # put in space separator if needed
        self.buffer.extend(text)        # add new text
Example #18
    def atSpace ( self ):

        """
        look for space char at start of buffer

        arguments:
            self

        returns:
            True if found, False otherwise
        """

        if len(self.buffer) == 0:
            return False
        else:
            return ellyChar.isWhiteSpace(self.buffer[0])
Example #19
    def skipSpaces(self):
        """
        skip over spaces at start of buffer

        arguments:
            self
        """

        n = len(self.buffer)
        if n == 0: return None
        k = 0
        while k < n:
            if not ellyChar.isWhiteSpace(self.buffer[k]):
                break
            k += 1
        self.buffer = self.buffer[k:]
        self._reset()
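
The same skip can be written as a small free function over a char list (a sketch, not the class method above):

def skip_leading_spaces(buffer):
    # drop leading white-space chars from a char-list buffer
    k = 0
    while k < len(buffer) and buffer[k].isspace():
        k += 1
    return buffer[k:]

print(skip_leading_spaces(list('   abc')))   # -> ['a', 'b', 'c']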
Example #20
    def skipSpaces ( self ):

        """
        skip over spaces at start of buffer

        arguments:
            self
        """

        n = len(self.buffer)
        if n == 0: return None
        k = 0
        while k < n:
            if not ellyChar.isWhiteSpace(self.buffer[k]):
                break
            k += 1
        self.buffer = self.buffer[k:]
        self._reset()
Example #21
def compile ( name , stb , defn , stem=None ):

    """
    static method to create an Elly vocabulary database from text file input

    arguments:
        name  - for new BSDDB database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary
        stem  - optional stemmer for indexing

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0

#   print >> sys.stderr , 'compiled stb=' , stb , 'stem=' , stem , 'db=' , db

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure
    if db  == None :
        print >> sys.stderr, 'no Python db package'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure:              # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure

#   print >> sys.stderr , 'zfs=' , zfs               # hexadecimal for all features off

    tsave = ''                                       # original term
    dsave = ''                                       #          definition

    try:
        filn = name + vocabulary                     # where to put vocabulary database
        try:
            os.remove(filn)                          # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn
        dbs = db.DB()                                # create new database
        dbs.set_flags(db.DB_DUP)                     # keys may identify multiple records
        dbs.open(filn,None,db.DB_HASH,db.DB_CREATE)  # open new database file
#       print >> sys.stderr , 'creating' , filn

        r = None                                          # for error reporting

        while True:                                       # process vocabulary records

            try:
#               print >> sys.stderr , '------------'
                r = defn.readline()                       # next definition
                if len(r) == 0: break                     # stop on EOF
                if r[0] == '#': continue                  # skip comment line
#               print >> sys.stderr , 'def=' , r

                k = r.find(':')                           # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                                # report error and quit entry
                    continue

                t = r[:k].strip()                         # term to go into dictionary
                d = r[k+1:].strip()                       # its definition
                tsave = t                                 # save for any error reporting
                dsave = d                                 #

#               print >> sys.stderr , ' tm=' , '<' + t + '>' , 'df=' , '<' + d + '>'
                if len(t) == 0 or len(d) == 0:
                    _err()                                # quit on missing parts
                    continue
                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')
                    continue

                n = toIndex(t)                            # get part of term to index
                if n == 0:
                    _err()                                # quit on bad term
                    continue
                w = t[:n]                                 # first word of term to define  
                if stem != None:
                    try:
                        w = stem.simplify(w)              # reduce for lookup key
                    except ellyException.StemmingError:
                        _err('bad stemming logic')
                        continue
#               print >> sys.stderr , '  w=' , w
                lcw = lcAN(w)                             # convert to ASCII lower case
#               print >> sys.stderr , 'lcw=' , '"' + lcw + '"'

                ns = syntaxSpecification.scan(d)          # find extent of syntax info
#               print >> sys.stderr , 'ns=' , ns
                if ns <= 0: _err('bad syntax specification')
#               print >> sys.stderr , 'PoS=' , d[:ns]

                syn = d[:ns]                              # syntax info as string
                d = d[ns:].strip()                        # rest of definition

                try:
#                   print >> sys.stderr , 'VT syn=' , syn
                    ss = SSpec(stb,syn)                   # decode syntax info to get
#                   print >> sys.stderr , 'VT ss =' , ss
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                    continue
                cat = str(ss.catg)                        #   syntax category
                syf = ss.synf.positive.hexadecimal(False) #   syntactic flags
#               print >> sys.stderr , 'syf=' , syf

                smf = zfs                                 # initialize defaults for
                pb = '0'                                  #   cognitive semantics
                cn = '-'                                  #

#               print >> sys.stderr , '0:d=[' + d + ']'
                if len(d) > 1:                            # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':                      # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                                continue
                            d = d[2:].strip()             # skip over
                        else:
                            ns = featureSpecification.scan(d) # look for ']' of features
#                           print >> sys.stderr , 'ns=' , ns
                            if ns < 0:
                                _err()
                                continue
                            sem = d[:ns]                  # get semantic features
                            d = d[ns:].strip()            # skip over
                            try:
#                               print >> sys.stderr , 'smf=' , smf
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                                continue
                            smf = fs.positive.hexadecimal(False) # convert to hex

#                       print >> sys.stderr , '1:d=[' + d + ']'
                        ld = len(d)
#                       print >> sys.stderr , 'ld=' , ld
                        if ld == 0:
                            _err('missing plausibility')
                            continue
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1                       # take any plus or minus sign
                        while np < ld:                    # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
#                       print >> sys.stderr , 'np=' , np
                        if np == 0:
                            _err('missing plausibility')
                            continue
                        pb = d[:np]                       # plausibility bias
#                       print >> sys.stderr , 'pb=' , pb
                        d = d[np:]
                        ld = len(d)
#                       print >> sys.stderr , '2:d=[' + d + ']'
                        if ld > 1:                        # any more to process?
                            c = d[0]                      # get next char after bias
                            d = d[1:]                     # advance scan
                            ld -= 1
                            if c == '/':                  # check for explicit concept
#                               print >> sys.stderr , 'getting concept'
                                np = 0
                                while np < ld:            # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                    continue
                                cn = d[:np]               # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()                    # signal bad format
                                continue
                        elif ld > 0:
                            _err()                        # unidentifiable trailing text
                            continue

                d = d.strip()                             # rest of definition
#               print 'rest of d=' , d
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')
                        continue

                ld = [ ]                            # for normalizing definition

                k = 0                               # count spaces removed
                sd = ''                             # previous char seen
                for cd in d:                        # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                    if cd == ',':
                        if sd == '=':
                            _err('missing translation')
                        cd = '#'                    # format for PICK operation
                    elif cd == '=' and sd == '=':
                        print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                        print >> sys.stderr , '*  at [' , tsave , ']'

                    sd = cd
                    ld.append(cd)                   # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)                 # definition with spaces removed

#               print >> sys.stderr , '3:d=[' + d + ']'

                vrc = [ t , ':' , cat , syf , smf ,
                        pb , cn ]                         # start BdB data record
                vss = u' '.join(vrc)                      # convert to string
                vss += u' ' + d                           # fill out record with rest of input
#               print >> sys.stderr , 'type(vss)=' , type(vss)
                rss = vss.encode('utf8')                  # convert to UTF-8

#               print >> sys.stderr , 'rec=' , vrc , 'tra=' , d
#               print >> sys.stderr , '   =' , rss

            except ellyException.FormatFailure:
                print >> sys.stderr , '*  at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue

#           print >> sys.stderr , 'lcw=' , lcw
            dbs.put(lcw,rss)                          # save in database
#           print >> sys.stderr , 'saved'

#       print >> sys.stderr , 'DONE'
        dbs.close()                                   # clean up

    except StandardError , e:                         # catch any other errors
        print >> sys.stderr , '**' , e
        print >> sys.stderr , '*  at' , r
        nerr += 1
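
The first parsing step of each vocabulary record, splitting at the first ':' into term and definition, can be sketched on its own; returning None stands in for the _err() error path, and the sample line is an assumption:

def split_entry(line):
    # split a vocabulary line at its first ':' into (term, definition);
    # None signals a malformed line
    k = line.find(':')
    if k < 0:
        return None
    term = line[:k].strip()
    defn = line[k + 1:].strip()
    if not term or not defn:
        return None
    return term, defn

print(split_entry('new york city : noun place'))   # -> ('new york city', 'noun place')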
Example #22
    def read(self):
        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

        #       print 'reading: buf=' , self.buf

        while True:

            if not self._reload(
            ):  # check if buffer empty and reload if needed
                return END  # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)  # next raw char in buffer

            if c == SHYP:  # ignore soft hyphen
                if len(self.buf) > 0:
                    if self.buf[0] == SP:
                        c = self.buf.pop(0)
                continue

            if not ellyChar.isText(c):  # unrecognizable Elly char?
                #               print 'c=' , '{0:04x}'.format(ord(c))
                if ellyChar.isCJK(c):
                    if ellyConfiguration.language != 'ZH':
                        c = '_'  # special handling for non-Chinese input
                elif not c in [u'\uff0c', u'\u3002']:
                    #                   print 'replace' , c , 'with NBSP'
                    c = NBSP  # by default, replace with no-break space

            lc = self._lc  # copy saved last char
            #           print 'lc=' , ord(lc)
            self._lc = c  # set new last char

            #           if c == "'":
            #               print 'apostrophe' , self.buf

            #           print 'c=' , '<' + c + '>'

            if c == HYPH:  # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif c == '.':  # check for ellipsis
                bb = self.buf
                bl = len(bb)
                #               print 'bl=' , bl , 'bb=' , bb
                if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                    self.buf = bb[2:]
                    c = ELLP
                elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[
                        2] == ' ' and bb[3] == '.':
                    self.buf = bb[4:]
                    c = ELLP
                break
            elif c == RSQm:  # check for single quote
                #               print 'at single quote'
                nc = self.peek()  # look at next char
                #               print 'next=' , nc
                if nc == RSQm:  # doubling of single quote?
                    self.buf.pop(0)  # if so, combine two single quotes
                    c = RDQm  # into one double quote
            elif not ellyChar.isWhiteSpace(c):
                if ellyChar.isWhiteSpace(lc):
                    self._cap = ellyChar.isUpperCaseLetter(c)
                break
            elif c == CR:  # always ignore
                continue
            elif c == NL:  # special handling of \n
                #               print 'got NL'
                nc = self.peek()  # look at next char

                while nc == CR:
                    self.buf.pop(0)  # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)  # special case when NL can be returned
                    break

                if nc == NL:  # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue  # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
                    #                   print 'NL to SP, lc=' , ord(lc)
                    c = SP  # convert NL to SP if not before another NL
            else:
                #               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP  # otherwise, convert white space to plain space

            self._cap = False

            if not ellyChar.isWhiteSpace(
                    lc):  # preceding char was not white space?
                #               print 'return SP'
                break  # if so, keep space in stream

        return c  # next filtered char
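
One of the filtering steps, collapsing a run of three '.' chars into a single ellipsis, can be shown in isolation; ELLIPSIS here is a local constant standing in for ELLP:

ELLIPSIS = '\u2026'

def collapse_ellipsis(buf):
    # replace a leading '...' in a char buffer with one ellipsis char
    if buf[:3] == ['.', '.', '.']:
        return [ELLIPSIS] + buf[3:]
    return buf

print(''.join(collapse_ellipsis(list('... and then'))))   # -> "… and then"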
Example #23
    def match ( self , txt , pnc , nxt ):

        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars up to and including punctuation char
            pnc   - punctuation char
            nxt   - single char after punctuation

        returns:
            True on match, False otherwise
        """

        self.noteBracketing(pnc)  # just in case this is bracketing

        if not pnc in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

        txl = txt[-self.maxl:] if len(txt) > self.maxl else txt
        
        txs = map(lambda x: x.lower(),txl) # actual left context for matching

#       print 'txs= ' + str(txs) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

        lt = len(txs)             # its length

#       print len(lp) , 'patterns'

        for p in lp:              # try matching each pattern

            if p.left != None:
                
                n = len(p.left)   # assume each pattern element must match one sequence char
#               print n , 'pattern elements' , lt , 'chars'
                if n > lt:
                    continue      # fail immediately because of impossibility of match
                if n < lt and ellyChar.isLetterOrDigit(txs[-n-1]):
                    continue      # fail because of text to match is after alphanumeric
                t = txs if n == lt else txs[-n:]
#               print 'pat=' , '[' + ellyWildcard.deconvert(p.left) + ']'
                if not ellyWildcard.match(p.left,t,0):
                    continue

#           nc = '\\n' if nxt == '\n' else nxt
#           print 'nxt=' , '[' + nc + ']'
#           print 'pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
#           if len(p.right) > 0: print '    ' ,  ord(p.right)

            if p.right == u'' or p.right == nxt: # check for specific char after possible stop
                return True
            if p.right == ellyWildcard.cCAN:     # check for nonalphanumeric
                if nxt == u'' or not  ellyChar.isLetterOrDigit(nxt):
                    return True
            if p.right == ellyWildcard.cSPC:     # check for white space
#               print 'looking for space'
                if nxt == u'' or nxt == u' ' or nxt == u'\n':
                    return True
            if p.right == u'.':                  # check for any punctuation
                if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
                    return True

        return False
Example #24
    def getNext(self):
        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

        #       print ( 'getNext' )

        self.resetBracketing()
        inBrkt = False

        nspc = 0  # set space count

        sent = []  # list buffer to fill

        x = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:  # EOF check
            return None

        c = END  # reset
        lc = END

        #       print ( 'x=' , '<' + x + '>' , ord(x) )
        self.inp.unread(x, SP)  # put first char back to restore input
        #       print ( '0  <<' , self.inp.buf )

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0  # alphanumeric count in sentence

        while True:

            x = self.inp.read()  # next input char

            if x == END:  # handle any EOF
                break

#           print ( 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' )
#           print ( 'sent=' , sent , 'nspc=' , nspc )

# check for table delimiters in text

            if len(sent) == 0:
                #               print ( 'table' )
                #               print ( '1  <<' , self.inp.buf )

                if x == '.' or x == '-':  # look for multiple '.' or '-'
                    while True:  # scan up to end of current buffering
                        y = self.inp.read()  #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break  #
                    continue  # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

            #           print ( 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' )
            if lc == SP or lc == END:  # normalize chars for proper bracketing
                if x == SQuo:  #
                    x = LSQm  # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:  #
                    x = LDQm  # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END:  #
                if x == SQuo:  # a SQuo followed by a space becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by a space becomes RDQm
                    x = RDQm  #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:  # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm  #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(
                x)  # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

            #           print ( 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt )

            sent.append(c)  # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue  # if alphanumeric, just add to sentence

            if c == SP:
                continue  # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()  # remove from sentence chars
                break

            # certain Unicode punctuation will always break

            if c in Hards:
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

            #           print ( '0  <<' , self.inp.buf )

            #           print ( 'sent=' , sent[:-1] )
            #           print ( 'punc=' , '<' + c + '>' )
            #           print ( 'next=' , cx )
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1], c, cx):
                    #                   print ( 'stop exception MATCH' )
                    if self.drop:
                        sent.pop()  # remove punctuation char from sentence
                        lc = SP
                    continue

#           print ( 'no stop exception MATCH for' , c )

#           print ( '@1  <<' , self.inp.buf )

# handle any nonstandard punctuation

            exoticPunctuation.normalize(c, self.inp)

            #           print ( '@2  <<' , self.inp.buf )

            # check for dash

            if c == '-':
                d = self.inp.read()
                if d == '-':
                    #                   print ( 'dash' )
                    while True:
                        d = self.inp.read()
                        if d != '-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print ( '@3  c=' , c , inBrkt )

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

                #               print ( 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) )

                if not inBrkt:
                    #                   print ( sent , 'so far' )
                    z = self.inp.read()
                    if self.shortBracketing(sent, z):
                        break
                    self.inp.unread(z)
                    #                   print ( 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' )
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
                    #                   print ( 'stop+quote' )
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
                        #                       print ( 'stop+quote+quote' )
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print ( 'continue' )
                continue

            elif not c in Stops:
                continue

            else:
                #               print ( 'check stopping!' )
                d = self.inp.read()
                #               print ( '@3  <<' , self.inp.buf )

                if d == None: d = '!'
                #               print ( 'stop=' , '<' + c + '> <' + d + '>' )

                #               print ( 'ellipsis check' )
                if c == '.' and c == d:
                    if self.inp.peek() != c:  # look for third '.' in ellipsis
                        self.inp.unread(d)  # if none, keep only first '.'
                    else:
                        self.inp.skip()  # found ellipsis
                        sent.append(d)  # complete it in sentence buffer
                        sent.append(d)  #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(
                                SP
                            )  # if part of token, put in space as separator
                    continue

                if c == ELLP:
                    #                   print ( 'found Unicode ellipsis, d=' , d )
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(
                            d)  # super special case of bad punctuation
                        self.inp.unread(' ')  # put in implied period and space
                        self.inp.unread('.')  #

                # special check for multiple stops

#               print ( 'next char d=' , d , ord(d) if d != END else 'NONE' )
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP  # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent, d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
                    #                   print ( 'no space after punc' )
                    continue

                # if no match for lookahead, put back

                elif d != END:
                    #                   print ( 'unread d=' , d )
                    self.inp.unread(d)

#               print ( 'possible stop' )

# check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
                    #                   print ( 'sent=' , sent )
                    #                   print ( 'ixn=' ,ixn )
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
                        #                       print ( 'cxn=' , cxn )
                        if not ellyChar.isDigit(cxn): break
#                   print ( 'break: ixn=' , ixn , 'ixb=' , ixb )
                    if ixn < ixb and cxn in [' ', '-', '+']:
                        prvw = self.inp.preview()
                        #                       print ( 'prvw=' , prvw )
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(
                                prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
                    #                   print ( 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview() )
                    #                   print ( 'nspc=' , nspc )
                    if c in [':', ';'] or nspc < 3:
                        sent.append(d)
                        #                       print ( 'add' , '<' + d + '> to sentence' )
                        #                       print ( 'sent=' , sent )
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print ( '@4  <<' , self.inp.buf )
                cx = self.inp.peek()
                if cx == None: cx = '!!'
                #               print ( 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent )
                #               print ( 'nAN=' , nAN , 'inBrkt=' , inBrkt )
                if nAN > 1:
                    break

        if sent == ['\u2026']:  # special case of sentence
            return list("-.-")  # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
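
The quote-orientation step near the top of the loop (a straight quote after a space becomes an opening mark, before a space or nonalphanumeric a closing mark) can be sketched separately; the constants and the neighbor tests below are stand-ins, not PyElly's:

LSQm, RSQm = '\u2018', '\u2019'   # left and right single quotation marks

def orient_single_quote(prev, cur, nxt):
    # decide whether a straight single quote opens or closes a quotation
    if cur != "'":
        return cur
    if prev in ('', ' '):
        return LSQm               # preceded by space: opening quote
    if nxt in ('', ' ') or not nxt.isalnum():
        return RSQm               # followed by space or punctuation: closing
    return cur

print(orient_single_quote(' ', "'", 'h'))    # -> opening mark '‘'
print(orient_single_quote('d', "'", ' '))    # -> closing mark '’'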
Example #25
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
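
The final loop above merges bindings whose text spans touch, so that later substitutions see one contiguous span instead of several fragments. Below is a minimal, self-contained sketch of that idea using plain [from, to] offset pairs rather than the actual PyElly binding records; the function name is illustrative only.

def consolidate(bindings):

    """merge [from, to] spans that continue one another (simplified sketch)"""

    merged = []
    last_end = -1
    for b in bindings:
        if merged and b[0] == last_end:  # continues the previous span?
            merged[-1][1] = b[1]         # if so, extend the previous binding
        else:
            merged.append(list(b))       # otherwise start a new binding
        last_end = b[1]
    return merged

print(consolidate([[0, 3], [3, 5], [7, 9]]))  # [[0, 5], [7, 9]]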
Exemplo n.º 27
0
    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        self.resetBracketing()

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<' , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            #########################################
            # accumulate chars and count alphanumeric
            #########################################

            lc = c
            c  = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #

            inBrkt = self.checkBracketing(x)    # do bracket checking with modified chars

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # but buffer original chars
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            z = self.inp.peek()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , '<' + z + '>'
            if c in Stops and self.stpx.match(sent[:-1],c,z):
#               print 'exception MATCH'
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                    lc = SP
                continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    if z == END or (ellyChar.isWhiteSpace(z) and lc in Stops):
                        if nAN > 1:
                            break
                continue

            elif not c in Stops or inBrkt:
                continue

            else:
#               print 'check stopping!'
                d = self.inp.read()
#               print '@3  <<' , self.inp.buf

                if d is None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(c)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

                # final check: is sentence long enough?

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx is None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
                if nAN > 1 and not inBrkt:
                    break

        if len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
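
getNext() above buffers characters until it sees stop punctuation in a context where a break is allowed, counting alphanumeric characters so that short fragments are not returned as sentences. The sketch below shows only that core idea on a plain string; it ignores the bracketing, quote normalization, dash, and ellipsis handling of the real method, and the names here are illustrative, not PyElly API.

STOPS = set('.!?')                 # stand-in for the Stops set used above

def simple_sentences(text):

    """split text at stop punctuation followed by white space (sketch only)"""

    sent = []                      # chars of current sentence
    nAN = 0                        # alphanumeric count, as in getNext()
    out = []
    for i, c in enumerate(text):
        sent.append(c)
        if c.isalnum():
            nAN += 1
            continue
        nxt = text[i + 1] if i + 1 < len(text) else ''
        if c in STOPS and (nxt == '' or nxt.isspace()) and nAN > 1:
            out.append(''.join(sent).strip())
            sent, nAN = [], 0      # start a new sentence
    if sent:
        out.append(''.join(sent).strip())
    return out

print(simple_sentences("Dr. Smith arrived. He sat down."))
# ['Dr.', 'Smith arrived.', 'He sat down.']

The wrong break after 'Dr.' in this naive version is exactly the kind of case the stpx.match() stop-exception check in the real method is there to suppress.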
Exemplo n.º 28
0
    def _matchAN ( self , ts ):

        """
        apply logic for alphanumeric date recognition

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

#       print 'ALPHANUMERIC'

        t = ts
        tl = len(ts)
        k = self._aMonth(t)            # look for month to start date string
        comma = False
#       print 'month len=' , k
        if k > 0:
            if k == tl: return 0
            if not ellyChar.isWhiteSpace(t[k]): return 0
            k += 1                     # skip space after month
            if k == tl: return 0
            t = t[k:]
            k = self._aDay(t)          # look for day of month
#           print 'day len=' , k
            if k == 0:
                self._dy = [ ]
                k = self._aYear(t)     # look for year immediately following
                if k > 0:
                    return tl - len(t) + k
                else:
                    return 0
#           print 'ts=' , ts
            tl = len(t)                # _aDay may have rewritten alphabetic day
            t = t[k:]
            if len(t) == 0:
#               print 'no year tl=' , tl , 'k=' , k , t
                return len(ts) - tl + k
            if t[0] == u',':           # look for comma after day
                t = t[1:]             # if found, remove and note
                comma = True
            if len(t) == 0: return tl
            if ellyChar.isWhiteSpace(t[0]): t = t[1:]
            if len(t) == 0: return tl
            k = self._aYear(t)         # look for year
#           print 'year len=' , k
            lnt = len(t)
            if comma and k < lnt and t[k] == ',':
                k += 1                 # remove comma after year if paired
#           print 'len(ts)=' , len(ts) , 'len(t)=' , len(t) , t
            return len(ts) - len(t) + k

        k = self._aDay(t)              # look for day of month to start date string
#       print 'start day len=' , k
        if k == 0:
            self._dy = [ ]
        elif k > 0 and k < tl:         # cannot be just bare number by itself
            tl = len(ts)               # _aDay may have rewritten alphabetic day
            t = t[k:]
#           print 'new t=' , t
            if (k > 2 and len(t) > 2 and
                t[0] == u' ' and
                t[1].upper() == 'O' and
                t[2].upper() == 'F'):
                t = t[3:]              # to handle day reference like '4th of'
            if len(t) == 0: return 0
            if not ellyChar.isWhiteSpace(t[0]): return 0
            t = t[1:]
            k = self._aMonth(t)        # look for month
            if k == 0: return 0
            t = t[k:]
            if len(t) == 0: return tl
            ntl = tl - len(t)
#           print 'ntl=' , ntl
            nd = 0
            if t[0] == u',':           # look for comma after month
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
                comma = True
            if ellyChar.isWhiteSpace(t[0]):
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            k = self._aYear(t)         # look for year
            if k > 0:
                if comma and k < len(t) and t[k] == ',': k += 1
                return ntl + k + nd    # full date found
            else:
                return ntl - nd        # only month and day of date found

#       print 'look for year only in' , t
        k = self._aYear(t)
        if k > 0:
            if k == tl:
                return k
            elif not ellyChar.isLetter(t[k]) and t[k] != '-':
                return k

        return 0                       # nothing found
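
_matchAN() works character by character through _aMonth(), _aDay(), and _aYear() and returns how many leading characters form a date. For orientation only, here is a rough regex approximation of the same "month day, year" and "day (of) month year" shapes; it illustrates the matching contract but is not the PyElly logic, and it skips the rewriting that _aDay() performs on alphabetic days.

import re

MONTHS = ('January|February|March|April|May|June|July|'
          'August|September|October|November|December')

PATTERN = (r'(?:' + MONTHS + r')\s+\d{1,2}(?:,\s*\d{4})?'      # "July 4, 1776"
           r'|\d{1,2}(?:st|nd|rd|th)?\s+(?:of\s+)?'            # "4th of "
           r'(?:' + MONTHS + r')(?:\s+\d{4})?')                # "July 1776"

DATE = re.compile(PATTERN, re.IGNORECASE)

def match_len(ts):

    """return count of leading chars recognized as a date, 0 if none"""

    m = DATE.match(''.join(ts))
    return m.end() if m else 0

print(match_len(list("July 4, 1776 was a Thursday")))  # 12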
Exemplo n.º 29
0
    def _matchAN(self, ts):
        """
        apply logic for alphanumeric date recognition

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

        #       print 'ALPHANUMERIC'

        t = ts
        tl = len(ts)
        k = self._aMonth(t)  # look for month to start date string
        comma = False
        #       print 'month len=' , k
        if k > 0:
            if k == tl: return 0
            if not ellyChar.isWhiteSpace(t[k]): return 0
            k += 1  # skip space after month
            if k == tl: return 0
            t = t[k:]
            k = self._aDay(t)  # look for day of month
            #           print 'day len=' , k
            if k == 0:
                self._dy = []
                k = self._aYear(t)  # look for year immediately following
                if k > 0:
                    return tl - len(t) + k
                else:
                    return 0
#           print 'ts=' , ts
            tl = len(t)  # _aDay may have rewritten alphabetic day
            t = t[k:]
            if len(t) == 0:
                #               print 'no year tl=' , tl , 'k=' , k , t
                return len(ts) - tl + k
            if t[0] == u',':  # look for comma after day
                t = t[1:]  # if found, remove and note
                comma = True
            if len(t) == 0: return tl
            if ellyChar.isWhiteSpace(t[0]): t = t[1:]
            if len(t) == 0: return tl
            k = self._aYear(t)  # look for year
            #           print 'year len=' , k
            lnt = len(t)
            if comma and k < lnt and t[k] == ',':
                k += 1  # remove comma after year if paired
#           print 'len(ts)=' , len(ts) , 'len(t)=' , len(t) , t
            return len(ts) - len(t) + k

        k = self._aDay(t)  # look for day of month to start date string
        #       print 'start day len=' , k
        if k == 0:
            self._dy = []
        elif k > 0 and k < tl:  # cannot be just bare number by itself
            tl = len(ts)  # _aDay may have rewritten alphabetic day
            t = t[k:]
            #           print 'new t=' , t
            if (k > 2 and len(t) > 2 and t[0] == u' ' and t[1].upper() == 'O'
                    and t[2].upper() == 'F'):
                t = t[3:]  # to handle day reference like '4th of'
            if len(t) == 0: return 0
            if not ellyChar.isWhiteSpace(t[0]): return 0
            t = t[1:]
            k = self._aMonth(t)  # look for month
            if k == 0: return 0
            t = t[k:]
            if len(t) == 0: return tl
            ntl = tl - len(t)
            #           print 'ntl=' , ntl
            nd = 0
            if t[0] == u',':  # look for comma after month
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
                comma = True
            if ellyChar.isWhiteSpace(t[0]):
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            k = self._aYear(t)  # look for year
            if k > 0:
                if comma and k < len(t) and t[k] == ',': k += 1
                return ntl + k + nd  # full date found
            else:
                return ntl - nd  # only month and day of date found

#       print 'look for year only in' , t
        k = self._aYear(t)
        if k > 0:
            if k == tl:
                return k
            elif not ellyChar.isLetter(t[k]) and t[k] != '-':
                return k

        return 0  # nothing found
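
Both versions of _matchAN() above share one contract: the return value is the number of leading characters to treat as a date, with 0 meaning no date was found. A hypothetical caller might use that count as shown below; consume_date and its matcher argument are illustrative names, not PyElly API.

def consume_date(buffr, match_date):

    """split off a recognized date token using a _matchAN-style matcher"""

    k = match_date(buffr)                  # number of chars matched, 0 if none
    if k == 0:
        return None, buffr                 # leave the buffer untouched
    return ''.join(buffr[:k]), buffr[k:]   # (date token, remaining chars)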
Exemplo n.º 30
0
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0
    quot = False                           # indicate component starting with "
    parn = False                           #                             with (
    cmma = False                           #                             with ,
#   print ( '_limit buffr=' , buffr , 'hstry=' , hstry )
    if buffr[0] == ',':                    # handle possible leading comma
        if hstry == 0 or lnb < 4: return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]):
            bix += 1
        cmma = True
#       print ( 'for comma, bix=' , bix )

    if buffr[bix] == '(':                  # handle short name in parentheses
        bix += 1
        parn = True
    if buffr[bix] == '"':                  # handle short name in double quotes
        bix += 1
        quot = True
#       print ( 'parn=' , parn , 'quot=' , quot )
    if parn or quot:
#       print ( 'enclosed component from' , buffr[bix:] )
        while bix < lnb:                   # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1             # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2         # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1         # add trailing quote only
                else:
                    return 0               # no match
            elif chx == '.':
                return bix + 1             # add trailing period
            elif not ellyChar.isLetter(chx):
                break                      # unrecognizable char for name
            bix += 1
#       print ( 'no closure' )
        return 0
    else:
#       print ( 'find component in' , buffr[bix:] )
        while bix < lnb:
            chx = buffr[bix]               # collect letters for name
#           print ( 'chx=' , chx )
            if chx == "'":
                if bix + 2 < lnb:
                    chn = buffr[bix+1]
                    if ellyChar.isWhiteSpace(chn):
                        break
                    if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                        break
            elif not ellyChar.isLetter(chx):
                if chx == '.':
                    bix += 1
#                   print ( 'increment bix=' , bix )
                break
            bix += 1

        if bix == lnb:

#           print ( 'ran out of chars' )
            return bix                     # running out of chars means match

        else:

#           getting here means that more text follows limit
#           and so we may have to pick up extra chars here

            chx = buffr[bix]
#           print ( 'next chx=' , chx , 'bix=' , bix )
            if ellyChar.isWhiteSpace(chx) or chx == "'":
                return bix                 # component can be terminated by space or (')
            elif chx == ',':
                if cmma:
                    return bix + 1         #     or comma when sequence starts with comma
                else:
                    return bix             #              when there is no starting comma
            elif ellyChar.isLetter(chx):
                return bix                 #     or letter, implying previous char was '.'
            else:
                return 0                   # failure to find name limit
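
_limit() returns how many leading characters of the buffer can extend a name, with special cases for commas, parentheses, quotes, apostrophes, and abbreviating periods. The stripped-down sketch below keeps only the unquoted branch, stopping at a space or comma and allowing a trailing period; str.isalpha() stands in for ellyChar.isLetter().

def component_length(buffr):

    """length of a leading name component in a list of chars, 0 if none"""

    n = 0
    for ch in buffr:
        if ch.isalpha():
            n += 1                # accumulate letters of the component
        elif ch == '.':
            return n + 1          # keep a trailing period, as in "Mr."
        elif ch in (' ', ','):
            return n              # space or comma ends the component
        else:
            return 0              # unrecognizable char for a name
    return n                      # running out of chars also ends it

print(component_length(list("Mr. Smith")))    # 3
print(component_length(list("Smith, John")))  # 5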
Exemplo n.º 31
0
    def read ( self ):

        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

#       print 'reading: buf=' , self.buf

        while True:

            if not self._reload():       # check if buffer empty and reload if needed
                return END               # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)          # next raw char in buffer

            if not ellyChar.isText(c):   # unrecognizable Elly char?
#               print 'c=' , ord(c)
                c = NBSP                 # if so, replace with no-break space

            lc = self._lc                # copy saved last char
#           print 'lc=' , ord(lc)
            self._lc = c                 # set new last char

#           if c == "'":
#               print 'apostrophe' , self.buf

            if c == HYPH:                # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif not ellyChar.isWhiteSpace(c):
                break
            elif c == CR:                # always ignore
                continue
            elif c == NL:                # special handling of \n
#               print 'got NL'
                nc = self.peek()         # look at next char

                while nc == CR:
                    self.buf.pop(0)      # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)      # special case when NL can be returned
                    break

                if nc == NL:             # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue             # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
#                   print 'NL to SP, lc=' , ord(lc)
                    c = SP               # convert NL to SP if not before another NL
            else:
#               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP                   # otherwise, convert white space to plain space

            if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
#               print 'return SP'
                break                    # if so, keep space in stream

        return c                         # next filtered char
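
read() above filters the raw stream: runs of white space come back as a single plain space, and a hyphen with space on both sides comes back as a dash. The sketch below reproduces just those two rules on a plain string; DASH is assumed here to be an en dash, and the CR/NL handling, EOF, and non-text replacement of the real method are left out.

DASH = '\u2013'    # assumed dash char; the real constant comes from the Elly modules

def filter_chars(text):

    """collapse white space and mark isolated hyphens (simplified sketch)"""

    out = []
    for i, c in enumerate(text):
        nxt = text[i + 1] if i + 1 < len(text) else ' '
        if c == '-' and out and out[-1] == ' ' and nxt == ' ':
            out.append(DASH)                  # isolated hyphen becomes a dash
        elif c.isspace():
            if out and not out[-1].isspace():
                out.append(' ')               # keep only one space per run
        else:
            out.append(c)
    return ''.join(out)

print(filter_chars("stop -  and   go"))       # 'stop – and go'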
Exemplo n.º 32
0
    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        self.resetBracketing()
        inBrkt = False

        nspc = 0           # set space count

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<' , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent , 'nspc=' , nspc

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c  = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , cx
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1],c,cx):
#                   print 'stop exception MATCH'
                    if self.drop:
                        sent.pop()   # remove punctuation char from sentence
                        lc = SP
                    continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c , inBrkt

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    if z == END or (ellyChar.isWhiteSpace(z) and lc in Stops):
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
#                   print 'stop+quote'
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
#                       print 'stop+quote+quote'
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print 'continue'
                continue

            elif not c in Stops:
                continue

            else:
#               print 'check stopping!'
                d = self.inp.read()
#               print '@3  <<' , self.inp.buf

                if d is None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(d)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                if c == ELLP:
#                   print 'found Unicode ellipsis, d=' , d
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(d)   # super special case of bad punctuation
                        self.inp.unread(' ') # put in implied period and space
                        self.inp.unread('.') #

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

#               print 'possible stop'

                # check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
#                   print 'sent=' , sent
#                   print 'ixn=' ,ixn
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
#                       print 'cxn=' , cxn
                        if not ellyChar.isDigit(cxn): break
#                   print 'break: ixn=' , ixn , 'ixb=' , ixb
                    if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                        prvw = self.inp.preview()
#                       print 'prvw=' , prvw
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
#                   print 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview()
#                   print 'nspc=' , nspc
                    if c in [ ':' , ';' ] or nspc < 3:
                        sent.append(d)
#                       print 'add' , '<' + d + '> to sentence'
#                       print 'sent=' , sent
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx is None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
#               print 'nAN=' , nAN , 'inBrkt=' , inBrkt
                if nAN > 1:
                    break

        if sent == [ u'\u2026' ]:  # special case of sentence
            return list("-.-")     # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
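
This version of getNext() also tracks bracketing, and for that it first normalizes straight quotes: a quote after a space (or at the start) is treated as opening, one before a space or other nonalphanumeric character as closing. A small sketch of just that mapping follows; LSQm, RSQm, LDQm, and RDQm are assumed here to be the usual curly-quote code points.

LSQm, RSQm = '\u2018', '\u2019'    # assumed left/right single quotation marks
LDQm, RDQm = '\u201C', '\u201D'    # assumed left/right double quotation marks

def normalize_quote(prev, c, nxt):

    """classify a straight quote as opening or closing from its neighbors"""

    opening = prev in ('', ' ')                       # space or start before it
    closing = nxt in ('', ' ') or not nxt.isalnum()   # space, end, or punctuation after
    if c == "'":
        return LSQm if opening else RSQm if closing else c
    if c == '"':
        return LDQm if opening else RDQm if closing else c
    return c

print(normalize_quote(' ', '"', 'H'))   # opening double quote
print(normalize_quote('d', '"', ' '))   # closing double quote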
Exemplo n.º 33
0
    def read ( self ):

        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

#       print 'reading: buf=' , self.buf

        while True:

            if not self._reload():       # check if buffer empty and reload if needed
                return END               # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)          # next raw char in buffer

            if c == SHYP:                # ignore soft hyphen
                if len(self.buf) > 0:
                    if self.buf[0] == SP:
                        c = self.buf.pop(0)
                continue

            if not ellyChar.isText(c):   # unrecognizable Elly char?
#               print 'c=' , '{0:04x}'.format(ord(c))
                if ellyChar.isCJK(c):
                    c = '_'              # special handling for Chinese
                else:
#                   print 'replace' , c , 'with NBSP'
                    c = NBSP             # by default, replace with no-break space

            lc = self._lc                # copy saved last char
#           print 'lc=' , ord(lc)
            self._lc = c                 # set new last char

#           if c == "'":
#               print 'apostrophe' , self.buf

#           print 'c=' , '<' + c + '>'

            if c == HYPH:                # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif c == '.':               # check for ellipsis
                bb = self.buf
                bl = len(bb)
#               print 'bl=' , bl , 'bb=' , bb
                if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                    self.buf = bb[2:]
                    c = ELLP
                elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                    self.buf = bb[4:]
                    c = ELLP
                break
            elif c == RSQm:              # check for single quote
#               print 'at single quote'
                nc = self.peek()         # look at next char
#               print 'next=' , nc
                if nc == RSQm:           # doubling of single quote?
                    self.buf.pop(0)      # if so, combine two single quotes
                    c = RDQm             # into one double quote
            elif not ellyChar.isWhiteSpace(c):
                if ellyChar.isWhiteSpace(lc):
                    self._cap = ellyChar.isUpperCaseLetter(c)
                break
            elif c == CR:                # always ignore
                continue
            elif c == NL:                # special handling of \n
#               print 'got NL'
                nc = self.peek()         # look at next char

                while nc == CR:
                    self.buf.pop(0)      # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)      # special case when NL can be returned
                    break

                if nc == NL:             # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue             # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
#                   print 'NL to SP, lc=' , ord(lc)
                    c = SP               # convert NL to SP if not before another NL
            else:
#               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP                   # otherwise, convert white space to plain space

            self._cap = False

            if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
#               print 'return SP'
                break                    # if so, keep space in stream

        return c                         # next filtered char
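
Compared with the earlier read(), this version also drops soft hyphens, maps CJK characters to a placeholder, merges doubled single quotes into one double quote, and collapses three periods into a single ellipsis character. The ellipsis rule on its own looks roughly like this sketch, with ELLP assumed to be U+2026.

ELLP = '\u2026'    # assumed ellipsis char

def collapse_ellipsis(buf):

    """replace a leading '...' or '. . .' in a char list with ELLP (sketch)"""

    if buf[:3] == ['.', '.', '.']:
        return [ELLP] + buf[3:]
    if buf[:5] == ['.', ' ', '.', ' ', '.']:
        return [ELLP] + buf[5:]
    return buf

print(''.join(collapse_ellipsis(list("... and then"))))   # '… and then'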
Exemplo n.º 34
0
    def _reload(self):
        """
        refill input line buffer and compute indentation

        arguments:
            self

        returns:
            True on success if buffer has at least one char, False otherwise
        """

        #       print '_reload'
        bex = ''  # save space char at end of buffer
        bcn = len(self.buf)
        if bcn > 1:
            return True  # no refilling needed
        elif bcn == 1:
            if ellyChar.isWhiteSpace(self.buf[0]):
                bex = self.buf[0]  # special case when only space char left
                self.buf = []  # refill to get chars after that space
            else:
                return True  # no refilling yet

        if self._eof:
            return False  # must return immediately on previous EOF

        while len(self.buf) == 0:

            #           print 'get more text'

            try:  # read in UTF8 line from input stream
                if self._prmpt: sys.stdout.write('>> ')
                s = self.inp.readline()  # new text line to add
                #               print 's=' , s
                if len(s) == 0:
                    #                   print '**EOF'
                    self._eof = True
                    return False  # EOF
                s = s.decode('utf8')  # convert UTF8 to Unicode string
#               print 'raw s=' , s
            except IOError:
                print >> sys.stderr, '** char stream ERROR'
                return False  # treat read failure as empty line

            k = 0
            while k < len(s):  # count leading white space chars
                if s[k] == NL: break  # but stop at end of line
                if not ellyChar.isWhiteSpace(s[k]): break
                k += 1
            self._in = k  # save indentation level
            s = s[k:]
            self.buf = list(s)  # put unindented text into buffer
            #           print 'k=' , k , ', s=' , '"' + s + '"'
            #           print self.buf
            if k > 0 and ellyConfiguration.noteIndentation:
                self.buf.insert(0, NL)  # if noted, indentation will break sentence

#           print 'len=' , len(self.buf)
            if len(self.buf) > 0:  # if usable input, stop filling
                if bex != '':  # but restore any saved space char from buffer
                    self.buf.insert(0, bex)
                return True

        return False  # cannot refill, ignore trailing space char
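
_reload() reads one raw line at a time, measures its indentation, strips it, and, when ellyConfiguration.noteIndentation is set, marks the indentation by pushing a newline onto the buffer so a later stage can break the sentence there. That bookkeeping in isolation, as a hedged stand-alone sketch:

def unindent(line, note_indentation=False):

    """return (indent level, char buffer) for one raw input line (sketch)"""

    k = 0
    while k < len(line) and line[k] != '\n' and line[k].isspace():
        k += 1                        # count leading white space chars
    buf = list(line[k:])              # buffer the unindented text
    if k > 0 and note_indentation:
        buf.insert(0, '\n')           # marked indentation can break a sentence
    return k, buf

print(unindent("    indented text\n", note_indentation=True))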
Exemplo n.º 35
0
def build(name, stb, defn):
    """
    static method to create an Elly vocabulary database from text file input

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0
    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

    #   print ( 'built stb=' , stb )

    if stb is None:
        print('no symbol table', file=sys.stderr)
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb, '[$]', True).positive.hexadecimal(False)
    except ellyException.FormatFailure:  # should never need this
        print('unexpected failure with zero features', file=sys.stderr)
        raise ellyException.TableFailure

#   print ( 'zfs=' , zfs )                           # hexadecimal for all features off

    tsave = ''  # original term
    dsave = ''  #          definition

    try:
        filn = name + vocabulary  # where to put vocabulary database
        try:
            os.remove(filn)  # delete the file if it exists
        except OSError:
            print('no', filn, file=sys.stderr)  # if no such file, warn but proceed

#### SQLite DB operations
####
        try:
            cdb = dbs.connect(filn)  # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error as e:
            print(e, file=sys.stderr)
            raise ellyException.TableFailure  # give up on any database failure

#       print ( 'creating' , filn )
#
####

        r = None  # for error reporting

        while True:  # process vocabulary definition records

            try:  # for catching FormatFailure exception
                #               print ( '------------' )
                r = defn.readline()  # next definition
                if len(r) == 0: break  # stop on EOF
                #               print ( type(r) , r )
                r = definitionLine.normalize(r)  #
                #               print ( 'to' , r )

                k = r.find(' : ')  # look for first ' : '
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()  # report error and quit entry

                t = r[:k].strip()  # term to go into dictionary
                d = r[k + 2:].strip()  # its definition
                tsave = t  # save for any error reporting
                dsave = d  #

                #               print ( ' tm=' , '<' + t + '>' , 'df=' , '<' + d + '>' )
                if len(t) == 0 or len(d) == 0:
                    _err()  # quit on missing parts
                if ellyConfiguration.language == 'ZH':  # special key for Chinese
                    wky = toKeyZH(t[0])
                else:
                    c = t[0]
                    if not ellyChar.isLetterOrDigit(c) and not c in initChr:
                        _err('bad term')

                    n = delimitKey(t)  # get part of term to index
                    #                   print ( 'delimit=' , n )
                    if n <= 0:
                        _err()  # quit on bad term
                    wky = toKey(t[:n])  # key part of term to define

#               print ( '  SQLite key=' , wky )

#               print ( 'd=' , d )
                ns = syntaxSpecification.scan(d)  # find extent of syntax info
                #               print ( 'ns=' , ns , '"' + d[ns:] + '"' )
                if ns <= 0: _err('bad syntax specification')
                if d[ns:] != '' and d[ns] != ' ':
                    _err('trailing chars in syntax specification')
                #               print ( 'PoS=' , d[:ns] )

                syn = d[:ns]  # syntax info as string
                d = d[ns:].strip()  # rest of definition

                try:
                    #                   print ( 'VT syn=' , syn )
                    ss = SSpec(stb, syn)  # decode syntax info
#                   print ( 'VT ss =' , ss )
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)  #   syntax category
                cid = _smfchk[ss.catg]  #   associated semantic feature ID
                syf = ss.synf.positive.hexadecimal(False)  #   syntactic flags
                #               print ( 'cat=' , cat )
                #               print ( 'syf=' , syf )

                smf = zfs  # initialize defaults for
                pb = '0'  #   cognitive semantics
                cn = conceptualHierarchy.NOname  #

                #               print ( '0:d=[' + d + ']' )
                if len(d) > 1:  # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':  # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()  # skip over
                        else:
                            ns = featureSpecification.scan(d)  # look for ']' of features
                            #                           print ( 'ns=' , ns )
                            if ns < 0:
                                _err()
                            sem = d[:ns]  # get semantic features
                            d = d[ns:].strip()  # skip over for subsequent processing

                            sid = sem[1]  # feature ID
                            if sid != cid:
                                if cid is not None:
                                    _err('inconsistent semantic feature id')
                                _smfchk[ss.catg] = sid

                            try:
                                #                               print ( 'smf=' , smf )
                                fs = FSpec(stb, sem, True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False)  # convert to hex

#                       print ( '1:d=[' + d + ']' )
                        ld = len(d)
                        #                       print ( 'ld=' , ld )
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1  # take any plus or minus sign
                        while np < ld:  # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
#                       print ( 'np=' , np )
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]  # plausibility bias
                        #                       print ( 'pb=' , pb )
                        d = d[np:]
                        ld = len(d)
                        #                       print ( '2:d=[' + d + ']' )
                        if ld > 1:  # any more to process?
                            c = d[0]  # get next char after bias
                            d = d[1:]  # advance scan
                            ld -= 1
                            if c == '/':  # check for explicit concept
                                #                               print ( 'getting concept' )
                                np = 0
                                while np < ld:  # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]  # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()  # signal bad format
                        elif ld > 0:
                            _err()  # unidentifiable trailing text
                    elif d[0] != '(':
                        dd = d
                        while ellyChar.isLetterOrDigit(dd[0]):
                            dd = dd[1:]
                        if len(dd) == 0 or dd[0] != '=':
                            _err()

                d = d.strip()  # rest of definition
                #               print ( 'rest of d=' , d )
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                ld = []  # for normalizing definition

                k = 0  # count spaces removed
                sd = ''  # previous char seen
                for cd in d:  # scan all chars in translation
                    #                   print ( 'cd=' , cd )
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':  # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                    if cd == ',':
                        if sd == '=':
                            _err('missing translation')
                        cd = '#'  # format for PICK operation
                    elif cd == '=' and sd == '=':
                        print('** WARNING \'=\' followed by \'=\'',
                              file=sys.stderr)
                        print('*  at [', tsave, ']', file=sys.stderr)

                    sd = cd
                    ld.append(cd)  # add char to reformatted definition

#               print ( 'ld=' , ld )
                if k > 0:
                    d = ''.join(ld)  # definition with spaces removed

#               print ( '3:d=[' + d + ']' )

                vrc = [t, '=:', cat, syf, smf, pb, cn]  # start data record
                vss = ' '.join(vrc)  # convert to string
                vss += ' ' + d  # fill out record with rest of input
#               print ( 'type(vss)=' , type(vss) )

#               print ( 'rec=' , vrc , 'tra=' , d )
#               print ( '   =' , vss )

            except ellyException.FormatFailure:  # will catch exceptions from _err()
                print('*  at [', tsave, end=' ', file=sys.stderr)
                if dsave != None:
                    print(':', dsave, end=' ', file=sys.stderr)
                print(']', file=sys.stderr)
                continue  # skip rest of processing this rule

#### SQLite DB operation
####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                #               print ( type(wky) , wky , type(vss) , vss )
                cur.execute(sql, (wky, vss))
            except dbs.Error as e:
                print('FATAL', e, file=sys.stderr)
                sys.exit(1)
#
####

#### SQLite DB operations
####
        if nerr == 0:
            cdb.commit()
        cdb.close()  # clean up


#       print ( 'DONE' )
#
####

    except Error as e:  # catch any other errors
        print('**', e, file=sys.stderr)
        print('*  at', r, file=sys.stderr)
        nerr += 1

    if nerr > 0:
        print('**', nerr, 'vocabulary table errors in all', file=sys.stderr)
        print('*  compilation FAILed', file=sys.stderr)
        cdb.close()  # discard any changes
        raise ellyException.TableFailure
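
A minimal sketch of the term/definition split that drives the loop above, with a hypothetical input record; delimitKey() and toKey() from the real module are not reproduced here:

# splitting one vocabulary definition line, mirroring the r.find(':') step above
line = 'side effect : noun[^x] 0'       # hypothetical input record
k = line.find(':')                       # first ':' separates term from definition
if k < 0:
    raise ValueError('bad record')       # stands in for _err() here
term = line[:k].strip()                  # 'side effect'
defn = line[k + 1:].strip()              # 'noun[^x] 0'
print(term, '|', defn)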
Exemplo n.º 36
0
def scan ( buffr ):

    """
    recognize personal names in text at current position

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    def doLook ( mth , itm ):

        """
        do lookup with specified method using
        global variables in Python 2.7.*

        arguments:
            mth  - name table method
            itm  - string to look up
        """

        global _typ , _nch            # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3: # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0:
                    _nch -= 1         # match without '.'

    global _typ , _nch
    global _toscan

#   print 'table=' , _table
    bln = len(buffr)
    if _table == None or bln < 2: return 0
    if _toscan > 0:
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
#   print 'scan chx=' , chx
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                                 # name components this time
    ncmp = 0                                   # number of components for current name
    ninf = 0                                   # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                        # define state for getting personal name
    mlen = 0                                   # last match length

    bix = 0                                    # buffer index to advance in scanning
    _typ = -1
    while bix < bln:
        ltyp = -1                              # last match type
        _nch = _limit(buffr[bix:],mlen)        # length of next possible name component
#       print 'top _nch=' , _nch
        if _nch == 0: return 0
        elm = _extract(buffr[bix:],_nch)       # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')  # type of next element
        doLook(_table.lookUp,elm)              # look it up in saved name table
#       print 'lookUp(' , elm , ')=' , _typ

        if _typ < 0:
            if _typ == nameTable.REJ:
                return 0                       # immediate rejection of any match
            if _typ == nameTable.STP:
                break                          # stop any more matching
            if elm[-1] == '.':                 # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed:
                    _nch -= 1
            if enclosed:                       # enclosed element assumed to be name
                if not elm in _cntxt:
                    _cntxt.append(elm)         # make sure always to save in local context
                    ninf += 1                  # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM           # neutral name type to be noncommital

        if _typ < 0:
            tok = buffr[bix:bix + _nch]        # unknown token to check
#           print 'call infer with tok=' , tok
            if infer(tok):
#               print 'digraph test passed'
                _typ = nameTable.XNM           # neutral name type inferred
                if not _table.checkPhonetic(tok):
                    ninf += 1                  # count inferred component if no phonetic support
#           print '_typ=' , _typ

        if nameTable.starts(_typ) and bix > 0: # if component not at start of name,
            break                              #     must stop name scan

#       print 'continuing bix=' , bix
        while _typ >= 0:                       # continue as long as match is viable
            ncmp += 1                          # count up component
            cmps.append(elm)                   # save component
            bix += _nch                        # move ahead in scan
#           print 'bix=' , bix
            if _typ > 0:
#               print '_typ=' , _typ
                if stat[_typ]:                 # check for duplication of component type
                    if (ltyp >= 0 and
                        ltyp != _typ):         # allowed only if duplicate is consecutive
                        break
                mlen = bix                     # save index on actual match
                ltyp = _typ

            if nameTable.ends(_typ):           # if component marks end of name,
                break                          #    must stop name scan

            stat[_typ] = True                  # update match state
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component

            _nch = _limit(buffr[bix:],mlen)    # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)   # get possible next component as string
            doLook(_table.lookUpMore,elm)      # look it up in saved name table
#           print 'lookUpMore(' , elm , ')=' , _typ

        if _typ < 0:                           # while-loop terminated without break
#           print 'ltyp=' , ltyp , 'mlen=' , mlen
            if ltyp < 0 or mlen == 0: break
            bix = mlen                         # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component
            continue

        break

#
#
#### additional constraints on acceptable personal name
#
#   print 'checking ltyp=' , ltyp
    if (ltyp == nameTable.CNJ or
        ltyp == nameTable.REL):                # a name cannot end with these types
        mlen -= _nch                           # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]):
            mlen -= 1
        ncmp -= 1
        cmps.pop()

#   print 'ncmp=' , ncmp

    if ncmp == 0:                              # nothing matched?
        _planAhead(buffr)                      # check for possible problems in next scan
        return 0

#   print 'cmps=' , cmps
    if ncmp == ninf:
        return 0                               # name cannot be purely inferred

#   print 'ncmp=' , ncmp
    if ncmp == 1:                              # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and
            not cmps[0] in _cntxt):
            return 0

#   print 'stat=' , stat[3:7]
    expl = (stat[nameTable.PNM] or             # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

#   print 'expl=' , expl
    if (not expl and
        not (stat[nameTable.TTL] and           # or it could have just a title
             stat[nameTable.INI])):            #    and an initial
        return 0
#
####

#   print 'accepted mlen=' , mlen
    for cmpo in cmps:                          # if whole name is OK,
        if not cmpo in _cntxt:                 #    remember all components
            _cntxt.append(cmpo)                #    not already listed in context

    return mlen                                # will be > 0 on successful match
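
The scan keeps one boolean flag per name-component type and allows a type to repeat only when the repeats are consecutive; a small self-contained sketch of that rule, using hypothetical integer codes in place of the nameTable type constants:

# hypothetical integer codes standing in for nameTable type constants
TTL, PNM = 1, 2
stat = [False] * 3                     # one flag per component type, as in scan()

def accept(typ, last_typ):
    """allow a repeated component type only when the repeats are consecutive"""
    if stat[typ] and last_typ != typ:
        return False                   # duplicate of a non-consecutive type: stop
    stat[typ] = True
    return True

print(accept(TTL, -1), accept(PNM, TTL), accept(PNM, PNM), accept(TTL, PNM))
# True True True False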
Exemplo n.º 37
0
def compile ( name , stb , defn ):

    """
    static method to create an Elly vocabulary database from text file input

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0
    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

#   print 'compiled stb=' , stb

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure:              # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure

#   print 'zfs=' , zfs               # hexadecimal for all features off

    tsave = ''                                       # original term
    dsave = ''                                       #          definition

    try:
        filn = name + vocabulary                     # where to put vocabulary database
        try:
            os.remove(filn)                          # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn        # if no such file, warn but proceed

#### SQLite
####
        try:
            cdb = dbs.connect(filn)                  # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error , e:
            print >> sys.stderr , e
            raise ellyException.TableFailure         # give up on any database failure

#       print 'creating' , filn
#
####

        r = None                                          # for error reporting

        while True:                                       # process vocabulary definition records

            try:                                          # for catching FormatFailure exception
#               print '------------'
                r = defn.readline()                       # next definition
                if len(r) == 0: break                     # stop on EOF
#               print type(r) , r

                k = r.find(':')                           # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                                # report error and quit entry

                t = r[:k].strip()                         # term to go into dictionary
                d = r[k+1:].strip()                       # its definition
                tsave = t                                 # save for any error reporting
                dsave = d                                 #

#               print ' tm=' , '<' + t + '>' , 'df=' , '<' + d + '>'
                if len(t) == 0 or len(d) == 0:
                    _err()                                # quit on missing parts
                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')

                n = delimitKey(t)                         # get part of term to index
                if n <= 0:
                    _err()                                # quit on bad term
                wky = toKey(t[:n])                        # key part of term to define
#               print '  SQLite key=' , wky

                ns = syntaxSpecification.scan(d)          # find extent of syntax info
#               print 'ns=' , ns
                if ns <= 0: _err('bad syntax specification')
#               print 'PoS=' , d[:ns]

                syn = d[:ns]                              # syntax info as string
                d = d[ns:].strip()                        # rest of definition

                try:
#                   print 'VT syn=' , syn
                    ss = SSpec(stb,syn)                   # decode syntax info
#                   print 'VT ss =' , ss
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)                        #   syntax category
                syf = ss.synf.positive.hexadecimal(False) #   syntactic flags
#               print 'syf=' , syf

                smf = zfs                                 # initialize defaults for
                pb = '0'                                  #   cognitive semantics
                cn = conceptualHierarchy.NOname           #

#               print '0:d=[' + d + ']'
                if len(d) > 1:                            # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':                      # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()             # skip over
                        else:
                            ns = featureSpecification.scan(d) # look for ']' of features
#                           print 'ns=' , ns
                            if ns < 0:
                                _err()
                            sem = d[:ns]                  # get semantic features
                            d = d[ns:].strip()            # skip over
                            try:
#                               print 'smf=' , smf
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False) # convert to hex

#                       print '1:d=[' + d + ']'
                        ld = len(d)
#                       print 'ld=' , ld
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1                       # take any plus or minus sign
                        while np < ld:                    # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
#                       print 'np=' , np
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]                       # plausibility bias
#                       print 'pb=' , pb
                        d = d[np:]
                        ld = len(d)
#                       print '2:d=[' + d + ']'
                        if ld > 1:                        # any more to process?
                            c = d[0]                      # get next char after bias
                            d = d[1:]                     # advance scan
                            ld -= 1
                            if c == '/':                  # check for explicit concept
#                               print 'getting concept'
                                np = 0
                                while np < ld:            # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]               # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()                    # signal bad format
                        elif ld > 0:
                            _err()                        # unidentifiable trailing text

                d = d.strip()                             # rest of definition
#               print 'rest of d=' , d
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                ld = [ ]                            # for normalizing definition

                k = 0                               # count spaces removed
                sd = ''                             # previous char seen
                for cd in d:                        # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                    if cd == ',':
                        if sd == '=':
                            _err('missing translation')
                        cd = '#'                    # format for PICK operation
                    elif cd == '=' and sd == '=':
                        print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                        print >> sys.stderr , '*  at [' , tsave , ']'

                    sd = cd
                    ld.append(cd)                   # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)                 # definition with spaces removed

#               print '3:d=[' + d + ']'

                vrc = [ t , ':' , cat , syf , smf ,
                        pb , cn ]                   # start data record
                vss = u' '.join(vrc)                # convert to string
                vss += u' ' + d                     # fill out record with rest of input
#               print 'type(vss)=' , type(vss)

#               print 'rec=' , vrc , 'tra=' , d
#               print '   =' , vss

            except ellyException.FormatFailure:
                print >> sys.stderr , '*  at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue                            # skip rest of processing

#### SQLite
####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
#               print type(wky) , wky , type(vss) , vss
                cur.execute(sql,(wky,vss))
            except dbs.Error , e:
                print >> sys.stderr , 'FATAL' , e
                sys.exit(1)
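
The record stored in the Vocab table is just the space-joined fields followed by the remainder of the definition; a rough sketch of that assembly with placeholder field values (all hypothetical):

# assembling one Vocab record as a string, as in the vrc/vss step above
t, cat, syf, smf, pb, cn = 'side effect', '2', '00', '00', '0', '^'   # placeholder fields
vrc = [t, ':', cat, syf, smf, pb, cn]
vss = ' '.join(vrc) + ' ' + 'effect'     # rest of the input fills out the record
print(vss)                               # side effect : 2 00 00 0 ^ effect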
Exemplo n.º 38
0
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes formatted as a string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break

#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        if tc == cALL:      # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':               # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)      # maximum match possible

#               print 'spanning=' , nm
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset
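
The final pass above merges bindings whose spans are contiguous and drops the dummy records used to block consolidation; a simplified stand-alone sketch of that step (the helper name and sample data are hypothetical, and the tagged-binding case is omitted):

def consolidate(bindings, matched_len):
    """merge contiguous [start, end] spans and drop dummy (start < 0) records"""
    out = [matched_len]                # slot 0 holds the total text length matched
    last_end = -1
    for bd in bindings:
        if bd[0] < 0:                  # optional-match marker: break continuity
            last_end = -1
        elif bd[0] == last_end:        # continuous with previous span: extend it
            out[-1][1] = bd[1]
            last_end = bd[1]
        else:
            out.append(list(bd))       # otherwise start a new consolidated span
            last_end = bd[1]
    return out

print(consolidate([[0, 3], [3, 5], [7, 9]], 9))   # [9, [0, 5], [7, 9]]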
Exemplo n.º 39
0
    def _reload ( self ):

        """
        refill input line buffer and compute indentation

        arguments:
            self

        returns:
            True if the buffer has at least one char after any refill, False otherwise
        """

#       print '_reload'
        bex = ''                         # save space char at end of buffer
        bcn = len(self.buf)
        if bcn > 1:
            return True                  # no refilling needed
        elif bcn == 1:
            if ellyChar.isWhiteSpace(self.buf[0]):
                bex = self.buf[0]        # special case when only space char left
                self.buf = [ ]           # refill to get chars after that space
            else:
                return True              # no refilling yet

        if self._eof:
            return False                 # must return immediately on previous EOF

        while len(self.buf) == 0:

#           print 'get more text'

            try:                         # read in UTF8 line from input stream
                if self._prmpt: sys.stdout.write('>> ')
                s = self.inp.readline()  # new text line to add
#               print 's=' , s
                if len(s) == 0:
#                   print '**EOF'
                    self._eof = True
                    return False         # EOF
                s = s.decode('utf8')     # convert UTF8 to Unicode string
#               print 'raw s=' , s
            except IOError:
                print >> sys.stderr , '** char stream ERROR'
                return False             # treat read failure as empty line

            k = 0
            while k < len(s):            # count leading white space chars
                if s[k] == NL: break     # but stop at end of line
                if not ellyChar.isWhiteSpace(s[k]): break
                k += 1
            self._in = k                 # save indentation level
            s = s[k:]
            self.buf = list(s)           # put unindented text into buffer
#           print 'k=' , k , ', s=' , '"' + s + '"'
#           print self.buf
            if k > 0 and ellyConfiguration.noteIndentation:
                self.buf.insert(0,NL)    # if noted, indentation will break sentence

#           print 'len=' , len(self.buf)
            if len(self.buf) > 0:        # if usable input, stop filling
                if bex != '':            # but restore any saved space char from buffer
                    self.buf.insert(0,bex)
                return True

        return False                     # cannot refill, ignore trailing space char
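
The refill measures indentation by counting leading white space up to a newline and then buffers the unindented remainder; a minimal sketch with str.isspace() standing in for ellyChar.isWhiteSpace():

NL = '\n'                                # assumed newline constant, as in the module
s = '    some text\n'                    # hypothetical line just read
k = 0
while k < len(s):                        # count leading white space chars
    if s[k] == NL: break                 # but stop at end of line
    if not s[k].isspace(): break
    k += 1
indent = k                               # indentation level (self._in above)
buf = list(s[k:])                        # unindented text into the buffer
print(indent, repr(''.join(buf)))        # 4 'some text\n'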
Exemplo n.º 40
0
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0
    quot = False                           # indicate component starting with "
    parn = False                           #                             with (
    cmma = False                           #                             with ,
#   print '_limit buffr=' , buffr , 'hstry=' , hstry
    if buffr[0] == ',':                    # handle possible leading comma
        if hstry == 0 or lnb < 4: return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]):
            bix += 1
        cmma = True
#       print 'for comma, bix=' , bix

    if buffr[bix] == '(':                  # handle short name in parentheses
        bix += 1
        parn = True
    if buffr[bix] == '"':                  # handle short name in double quotes
        bix += 1
        quot = True
#       print 'parn=' , parn , 'quot=' , quot
    if parn or quot:
#       print 'enclosed component from' , buffr[bix:]
        while bix < lnb:                   # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1             # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2         # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1         # add trailing quote only
                else:
                    return 0               # no match
            elif chx == '.':
                return bix + 1             # add trailing period
            elif not ellyChar.isLetter(chx):
                break                      # unrecognizable char for name
            bix += 1
#       print 'no closure'
        return 0
    else:
#       print 'find component in' , buffr[bix:]
        while bix < lnb:
            chx = buffr[bix]               # collect letters for name
#           print 'chx=' , chx
            if chx == "'":
                if bix + 2 < lnb:
                    chn = buffr[bix+1]
                    if ellyChar.isWhiteSpace(chn):
                        break
                    if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                        break
            elif not ellyChar.isLetter(chx):
                if chx == '.':
                    bix += 1
#                   print 'increment bix=' , bix
                break
            bix += 1

        if bix == lnb:

#           print 'ran out of chars'
            return bix                     # running out of chars means match

        else:

#           getting here means that more text follows limit
#           and so we may have to pick up extra chars here

            chx = buffr[bix]
#           print 'next chx=' , chx , 'bix=' , bix
            if ellyChar.isWhiteSpace(chx) or chx == "'":
                return bix                 # component can be terminated by space or (')
            elif chx == ',':
                if cmma:
                    return bix + 1         #     or comma when sequence starts with comma
                else:
                    return bix             #              when there is no starting comma
            elif ellyChar.isLetter(chx):
                return bix                 #     or letter, implying previous char was '.'
            else:
                return 0                   # failure to find name limit
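
A rough, self-contained illustration of the enclosure handling above: a component opened by '(' or '"' is accepted only up to its closing mark, with str.isalpha() standing in for ellyChar.isLetter() (an approximation, not the module's full character handling):

def limit_enclosed(chars):
    """length of a leading '("Bo")'-style component, 0 if not properly closed"""
    i = 0
    parn = chars[i] == '('                      # component in parentheses?
    if parn: i += 1
    quot = i < len(chars) and chars[i] == '"'   # component in double quotes?
    if quot: i += 1
    while i < len(chars):
        c = chars[i]
        if quot and c == '"':
            if not parn:
                return i + 1             # count trailing quote only
            if i + 1 < len(chars) and chars[i + 1] == ')':
                return i + 2             # count trailing quote and parenthesis
            return 0                     # '(' was never closed
        if not quot and parn and c == ')':
            return i + 1                 # count trailing parenthesis
        if c == '.':
            return i + 1                 # count trailing period
        if not c.isalpha():
            return 0                     # unrecognizable char for a short name
        i += 1
    return 0                             # ran out of chars with no closure

print(limit_enclosed(list('("Bo") Smith')))   # 6
print(limit_enclosed(list('(Bo) Smith')))     # 4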
Exemplo n.º 41
0
    def _matchAN ( self , ts ):

        """
        apply logic for alphanumeric date recognition

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

#       print 'ALPHANUMERIC'

        t = ts
        tl = len(ts)
        k = self._aMonth(t)            # look for month to start date string
        if k > 0:
            if k == tl: return 0
            if not ellyChar.isWhiteSpace(t[k]): return 0
            k += 1                     # skip space after month
            if k == tl: return 0
            t = t[k:]
            k = self._aDay(t)          # look for day of month
            if k == 0: return 0
            tl = len(ts)               # _aDay may have rewritten alphabetic day
            t = t[k:]
            if len(t) == 0: return 0
            if t[0] == u',': t = t[1:] # look for comma after day
            if len(t) == 0: return tl
            if ellyChar.isWhiteSpace(t[0]): t = t[1:]
            if len(t) == 0: return tl
            k = self._aYear(t)         # look for year
            return tl - len(t) + k
        else:
            k = self._aDay(t)          # look for day of month to start date string
            if k == 0 or k == tl: return 0
            tl = len(ts)               # _aDay may have rewritten alphabetic day
            t = t[k:]
#           print 'new t=' , t
            if (k > 2 and len(t) > 2 and
                t[0] == u' ' and
                t[1].upper() == 'O' and
                t[2].upper() == 'F'):
                t = t[3:]              # to handle day reference like '4th of'
            if len(t) == 0: return 0
            if not ellyChar.isWhiteSpace(t[0]): return 0
            t = t[1:]
            k = self._aMonth(t)        # look for month
            if k == 0: return 0
            t = t[k:]
            if len(t) == 0: return tl
            ntl = tl - len(t)
#           print 'ntl=' , ntl
            nd = 0
            if t[0] == u',':           # look for comma after month
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            if ellyChar.isWhiteSpace(t[0]):
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            k = self._aYear(t)         # look for year
            if k > 0:
                return ntl + k + nd    # full date found
            else:
                return ntl - nd        # only month and day of date found
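
The logic above accepts either a 'month day[, year]' or a 'day [of] month[, year]' ordering; a rough regex analogue of those two shapes (an assumption for illustration, not the mechanism _matchAN actually uses):

import re

MONTH = (r'(January|February|March|April|May|June|July|August|'
         r'September|October|November|December)')
PATTERN = re.compile(
    MONTH + r'\s+\d{1,2}(st|nd|rd|th)?,?(\s+\d{4})?'                  # January 5, 2015
    + r'|' +
    r'\d{1,2}(st|nd|rd|th)?(\s+of)?\s+' + MONTH + r',?(\s+\d{4})?'    # 5th of January 2015
)

for s in ('January 5, 2015', '5th of January 2015'):
    m = PATTERN.match(s)
    print(s, '->', m.end() if m else 0)   # count of chars matched, as scan() would return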
Exemplo n.º 42
0
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes formatted as a string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])  # calculate min char count to match rest of pattern

#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # char type matching a wildcard

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",ord(tc)

        if tc == cALL:   # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to   new     offset
#           print "offs=",offs
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                #   later binding consolidation
            if last != '':
                offs -= 1             # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1             # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':            # still more to match?
                offs -= 1
                nm = _span(tc)        # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch

#       print "fail - unwinding",unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd             # consolidated bindings plus new offset
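
For orientation, here is a hypothetical sketch of how a caller might read the value returned above. The helper name showBindings and the sample arguments are assumptions for illustration; only the structure of the result (matched length first, then consolidated [start,end] pairs for wildcard spans) comes from the code.

def showBindings ( text , mbd ):

    """
    hypothetical helper: report wildcard bindings produced by the matcher

    arguments:
        text - scanned text as a list of chars
        mbd  - matcher result (None when nothing matched)
    """

    if mbd is None:
        print('no match')
        return
    print('chars matched:', mbd[0])             # new offset after matching
    for st, en in mbd[1:]:                      # each consolidated binding
        print('binding:', ''.join(text[st:en]))
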
Example no. 43
def scan ( buffr ):

    """
    recognize personal names in text at current position

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    def doLook ( mth , itm ):

        """
        do lookup with specified method using
        global variables in Python 2.7.*

        arguments:
            mth  - name table method
            itm  - string to look up
        """

        global _typ , _nch            # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3: # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0:
                    _nch -= 1         # match without '.'

    global _typ , _nch
    global _toscan

#   print ( 'table=' , _table )
    bln = len(buffr)
    if _table == None or bln < 2: return 0
    if _toscan > 0:
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
#   print ( 'scan chx=' , chx )
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                                 # name components this time
    ncmp = 0                                   # number of components for current name
    ninf = 0                                   # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                        # define state for getting personal name
    mlen = 0                                   # last match length

    bix = 0                                    # buffer index to advance in scanning
    _typ = -1
    while bix < bln:
        ltyp = -1                              # last match type
        _nch = _limit(buffr[bix:],mlen)        # length of next possible name component
#       print ( 'top _nch=' , _nch )
        if _nch == 0: return 0
        elm = _extract(buffr[bix:],_nch)       # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')  # type of next element
        doLook(_table.lookUp,elm)              # look it up in saved name table
#       print ( 'lookUp(' , elm , ')=' , _typ )

        if _typ < 0:
            if _typ == nameTable.REJ:
                return 0                       # immediate rejection of any match
            if _typ == nameTable.STP:
                break                          # stop any more matching
            if elm[-1] == '.':                 # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed:
                    _nch -= 1
            if enclosed:                       # enclosed element assumed to be name
                if not elm in _cntxt:
                    _cntxt.append(elm)         # make sure always to save in local context
                    ninf += 1                  # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM           # neutral name type to be noncommital

        if _typ < 0:
            tok = buffr[bix:bix + _nch]        # unknown token to check
#           print ( 'call infer with tok=' , tok )
            if infer(tok):
#               print ( 'digraph test passed' )
                _typ = nameTable.XNM           # neutral name type inferred
                if not _table.checkPhonetic(tok):
                    ninf += 1                  # count inferred component if no phonetic support
#           print ( '_typ=' , _typ )

        if nameTable.starts(_typ) and bix > 0: # if component not at start of name,
            break                              #     must stop name scan

#       print ( 'continuing bix=' , bix )
        while _typ >= 0:                       # continue as long as match is viable
            ncmp += 1                          # count up component
            cmps.append(elm)                   # save component
            bix += _nch                        # move ahead in scan
#           print ( 'bix=' , bix )
            if _typ > 0:
#               print ( '_typ=' , _typ )
                if stat[_typ]:                 # check for duplication of component type
                    if (ltyp >= 0 and
                        ltyp != _typ):         # allowed only if duplicate is consecutive
                        break
                mlen = bix                     # save index on actual match
                ltyp = _typ

            if nameTable.ends(_typ):           # if component marks end of name,
                break                          #    must stop name scan

            stat[_typ] = True                  # update match state
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component

            _nch = _limit(buffr[bix:],mlen)    # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)   # get possible next component as string
            doLook(_table.lookUpMore,elm)      # look it up in saved name table
#           print ( 'lookUpMore(' , elm , ')=' , _typ )

        if _typ < 0:                           # while-loop terminated without break
#           print ( 'ltyp=' , ltyp , 'mlen=' , mlen )
            if ltyp < 0 or mlen == 0: break
            bix = mlen                         # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component
            continue

        break

#
#
#### additional constraints on acceptable personal name
#
#   print ( 'checking ltyp=' , ltyp )
    if (ltyp == nameTable.CNJ or
        ltyp == nameTable.REL):                # a name cannot end with these types
        mlen -= _nch                           # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]):
            mlen -= 1
        ncmp -= 1
        cmps.pop()

#   print ( 'ncmp=' , ncmp )

    if ncmp == 0:                              # nothing matched?
        _planAhead(buffr)                      # check for possible problems in next scan
        return 0

#   print ( 'cmps=' , cmps )
    if ncmp == ninf:
        return 0                               # name cannot be purely inferred

#   print ( 'ncmp=' , ncmp )
    if ncmp == 1:                              # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and
            not cmps[0] in _cntxt):
            return 0

#   print ( 'stat=' , stat[3:7] )
    expl = (stat[nameTable.PNM] or             # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

#   print ( 'expl=' , expl )
    if (not expl and
        not (stat[nameTable.TTL] and           # or it could have just a title
             stat[nameTable.INI])):            #    and an initial
        return 0
#
####

#   print ( 'accepted mlen=' , mlen )
    for cmpo in cmps:                          # if whole name is OK,
        if not cmpo in _cntxt:                 #    remember all components
            _cntxt.append(cmpo)                #    not already listed in context

    return mlen                                # will be > 0 on successful match
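
A hedged usage sketch for scan(): the sample text and the assumption that the module-level _table already holds a loaded name table are illustrative only; the char-list argument and the integer return convention are taken from the code above.

# hypothetical driver for scan(); assumes the enclosing module has already
# been initialized so that _table refers to a loaded name table

buffr = list('John Smith said hello')      # text to scan as a list of chars
nch = scan(buffr)                          # chars matched at start of buffer
if nch > 0:
    print(''.join(buffr[:nch]))            # recognized personal name
else:
    print('no personal name recognized')
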
Example no. 44
    def match ( self , txt , pnc , nxt ):

        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars leading up to, but not including, the punctuation char
            pnc   - punctuation char
            nxt   - single char after punctuation

        returns:
            True on match, False otherwise
        """

#       print 'matching for txt=' , txt , 'pnc=' , pnc , 'nxt=' , nxt

#       print 'lstg=' , self.lstg
        if not pnc in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

#       print len(lp) , 'patterns'

        txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

        txs = [ x.lower() for x in txl ]   # actual left context for matching (a list, so len() below also works in Python 3)

        lt = len(txs)             # its length

#       print 'txs= ' + unicode(txs) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

        for p in lp:              # try matching each pattern

            if p.left != None:

                n = len(p.left)   # assume each pattern element must match one sequence char
#               print 'n=' , n , 'p=' , unicode(p)
                if n > lt:
                    continue      # fail immediately because of impossibility of match
                t = txs if n == lt else txs[-n:]
#               print 'left pat=' , '[' + ellyWildcard.deconvert(p.left) + ']'
#               print 'versus t=' , t
                if not ellyWildcard.match(p.left,t,0):
#                   print 'no left match'
                    continue
                if n < lt and ellyChar.isLetterOrDigit(t[0]):
                    if ellyChar.isLetterOrDigit(txs[-n-1]):
                        continue  # fail because of no break in text

#           nc = '\\n' if nxt == '\n' else nxt
#           print 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
#           print 'versus c=' , nc

            if p.right == []:
                return True
            pcx = p.right[0]
            if pcx == nxt:                     # check for specific char after possible stop
#               print 'right=' , nxt
                return True
            if pcx == ellyWildcard.cCAN:       # check for nonalphanumeric
                if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
#                   print 'right nonalphanumeric=' , nxt
                    return True
            if pcx == ellyWildcard.cSPC:       # check for white space
#               print 'looking for space'
                if nxt == u'' or nxt == u' ' or nxt == u'\n':
#                   print 'right space'
                    return True
#           print 'last check'
            if pcx == u'.':                    # check for any punctuation
                if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
#                   print 'right punc=' , nxt
                    return True

        return False
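
For context, the sentence reader in the next example drives this method as sketched below; stpx stands for an already-loaded stop-exception table and the sample strings are assumptions, while the argument shape mirrors the call self.stpx.match(sent[:-1],c,z) made in getNext().

# hypothetical call shape for the exception check
txt = list('see p')                  # left context before the punctuation (assumed example)
pnc = '.'                            # candidate stop punctuation
nxt = ' '                            # single char peeked after it
if stpx.match(txt, pnc, nxt):        # stpx: loaded exception table (assumed)
    pass                             # exception matched: '.' does not end the sentence here
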
Example no. 45
    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        sent = [ ]         # list buffer to fill

        parenstop = False  # initially, parentheses will NOT stop sentence

        c = self.inp.read()
        if c == SP:
            c = self.inp.read()

        if c == END:       # EOF check
            return None

#       print 'c=' , ord(c)
        self.inp.unread(c,SP)
#       print '0  <<' , self.inp.buf

        # fill sentence buffer up to next stop punctuation

        nAN = 0            # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>'
#           print 'sent=' , sent

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            #########################################
            # accumulate chars and count alphanumeric
            #########################################

            c = x
            sent.append(c)
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            z = self.inp.peek()  # for context of match call
#           print 'peek z=' , z

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , '<' + z + '>'
            if self.stpx.match(sent[:-1],c,z):
#               print 'exception MATCH'
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                continue

#           print '1  <<' , self.inp.buf

#           print 'no exception MATCH'

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '2  <<' , self.inp.buf

            # handle parentheses as possible stop

            if nAN == 0 and self.stpx.inBracketing():
                parenstop = True
            elif parenstop and not self.stpx.inBracketing():
                break            # treat as stop

#           print '3  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

            if not c in Stops:
                continue

            else:
#               print 'stopping possible!'
                d = self.inp.read()
#               print '4  <<' , self.inp.buf

                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(c)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(' ') # if part of token, put in space as separator
                    continue

                # special check for multiple stops

#               print 'Stops d=' , d , ord(d) if d != '' else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break    # stop once past the run of stop chars
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = u' '

                # break sentence except when in parentheses

                elif d in RBs:
#                   print 'followed by' , '<' + d + '>'
                    if not self.stpx.inBracketing():
                        break
                    else:
                        if self.drop:
                            sent.pop()
                        self.inp.unread(d)
                        continue

                # special check for single or double quotes, which should
                # be included with current sentence after stop punctuation

                elif d in QUOs:
#                   print 'QUO d=' , d , ord(d)
                    x = self.inp.peek()
                    if x == END or ellyChar.isWhiteSpace(x):
                        sent.append(d)
                        break
                    else:
                        self.inp.unread(SP)
                        continue

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    sent.append(d)
                    continue

                # if no match for lookahead, put back

                elif d != '':
#                   print 'unread d=' , d
                    self.inp.unread(d)

                # final check: is sentence long enough?

#               print '5  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
                if nAN > 1:
                    break

        if len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
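
A minimal driver sketch: the reader object and its construction are assumptions, since only getNext() and its return convention appear in this excerpt.

# hypothetical loop over an already-constructed sentence reader
while True:
    sent = reader.getNext()          # next sentence as a list of chars
    if sent is None:                 # None signals an exhausted input stream
        break
    print(''.join(sent))             # one reconstructed sentence per line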