Exemplo n.º 1
0
def transitions(distance):
    """Regroup the generated transition tables so they are keyed by state.

    ``possibleStates.genTransitions(distance)`` is enumerated; each item is a
    mapping from a string key to an inner ``{key: value}`` mapping.  The
    string key is evaluated and passed through ``bitShift``; the result is
    used as the new inner key, while the old inner key becomes the outer one.

    Returns a dict ``{index: {inner_key: {bitShift(eval(string_key)): value}}}``.
    An item whose mapping is empty contributes no entry to the result.
    """
    regrouped = {}
    for index, by_vector in enumerate(possibleStates.genTransitions(distance)):
        by_state = {}
        for vector_repr, targets in by_vector.iteritems():
            # The key arrives as a string; eval() turns it back into a value.
            # NOTE(review): eval is only acceptable while the tables are
            # generated locally -- confirm they never come from outside.
            shifted = bitShift(eval(vector_repr))
            for state, value in targets.items():
                by_state.setdefault(state, {})[shifted] = value
            # Recorded inside the loop: an empty mapping leaves no entry.
            regrouped[index] = by_state

    return regrouped
def main():

  if len(sys.argv) != 3:
    print
    print 'Usage: python -u %s N <True/False>' % sys.argv[0]
    print
    print 'NOTE: the resulting .java file is created in the current working dir!'
    print
    sys.exit(1)

  n = int(sys.argv[1])

  transpose = (sys.argv[2] == "True")

  tables = genTransitions(n, transpose)

  stateMap = {}

  # init null state
  stateMap['[]'] = -1

  # init start state
  stateMap['[(0, 0)]'] = 0

  w = LineOutput()

  w('package org.apache.lucene.util.automaton;')
  w('')
  w('/*')
  w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
  w(' * contributor license agreements.  See the NOTICE file distributed with')
  w(' * this work for additional information regarding copyright ownership.')
  w(' * The ASF licenses this file to You under the Apache License, Version 2.0')
  w(' * (the "License"); you may not use this file except in compliance with')
  w(' * the License.  You may obtain a copy of the License at')
  w(' *')
  w(' *     http://www.apache.org/licenses/LICENSE-2.0')
  w(' *')
  w(' * Unless required by applicable law or agreed to in writing, software')
  w(' * distributed under the License is distributed on an "AS IS" BASIS,')
  w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.')
  w(' * See the License for the specific language governing permissions and')
  w(' * limitations under the License.')
  w(' */')
  w('')
  w('// The following code was generated with the moman/finenight pkg')
  w('// This package is available under the MIT License, see NOTICE.txt')
  w('// for more details.')
  w('')
  w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
  w('')
  if transpose:
    w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
    w('    with transpositions as primitive edits */')
    className = 'Lev%dTParametricDescription' % n
  else:
    w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
    className = 'Lev%dParametricDescription' % n

  w('class %s extends ParametricDescription {' % className)

  w('')
  w('@Override')
  w('int transition(int absState, int position, int vector) {')

  w('  // null absState should never be passed in')
  w('  assert absState != -1;')

  w('')
  w('  // decode absState -> state, offset')
  w('  int state = absState/(w+1);')
  w('  int offset = absState%(w+1);')
  w('  assert offset >= 0;')
  w('')  

  machines = []
  
  for i, map in enumerate(tables):
    if i == 0:
      w('if (position == w) {')
    elif i == len(tables)-1:
      w('} else {')
    else:
      w('} else if (position == w-%d) {' % i)

    if i != 0 and MODE == 'switch':
      w('switch(vector) {')

    l = map.items()
    l.sort()

    numCasesPerVector = None
    numVectors = len(l)

    if MODE == 'array':
      toStateArray = []
      toOffsetIncrArray = []

    for charVar, states in l:

      # somehow it's a string:
      charVar = eval(charVar)

      if i != 0 and MODE == 'switch':
        w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
        w.indent()
        
      l = states.items()

      byFromState = {}

      # first pass to assign states
      byAction = {}
      for s, (toS, offset) in l:
        state = str(s)
        
        toState = str(toS)
        if state not in stateMap:
          stateMap[state] = len(stateMap)-1
        if toState not in stateMap:
          stateMap[toState] = len(stateMap)-1

        byFromState[stateMap[state]] = (1+stateMap[toState], offset)

        fromStateDesc = s[1:len(s)-1]
        toStateDesc = ', '.join([str(x) for x in toS])   

        tup = (stateMap[toState], toStateDesc, offset)
        if tup not in byAction:
          byAction[tup] = []
        byAction[tup].append((fromStateDesc, stateMap[state]))

      if numCasesPerVector is None:
        numCasesPerVector = len(l)
      else:
        # we require this to be uniform... empirically it seems to be!
        assert numCasesPerVector == len(l)

      if MODE == 'array':

        for s in range(numCasesPerVector):
          toState, offsetIncr = byFromState[s]
          toStateArray.append(toState)
          toOffsetIncrArray.append(offsetIncr)

      else:

        # render switches
        w('switch(state) {   // %s cases' % len(l))

        for (toState, toStateDesc, offset), lx in byAction.items():
          for fromStateDesc, fromState in lx:
            w('case %s: // %s' % (fromState, fromStateDesc))
          w.indent()
          w('  state = %s; // %s' % (toState, toStateDesc))
          if offset > 0:
            w('  offset += %s;' % offset)
          w('break;')
          w.outdent()

        w('}')
        if i != 0:
          w('break;')
          w.outdent()

    if MODE == 'array':
      # strangely state can come in wildly out of bounds....
      w('  if (state < %d) {' % numCasesPerVector)
      w('    final int loc = vector * %d + state;' % numCasesPerVector)
      if PACKED:
        w('    offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
        w('    state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
      else:
        w('    offset += offsetIncrs%d[loc];' % i)
        w('    state = toStates%d[loc]-1;' % i)
      w('  }')
    elif i != 0:
      w('}')

    machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))

  # ends switch statement for machine
  w('}')

  w('')

  w('  if (state == -1) {')
  w('    // null state')
  w('    return -1;')
  w('  } else {')
  w('    // translate back to abs')
  w('    return state*(w+1)+offset;')
  w('  }')

  # ends transition method
  w('}')

  subs = []
  if MODE == 'array':
    w.indent()
    for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
      w('')
      w.outdent()
      w('// %d vectors; %d states per vector; array length = %d' % \
        (numVectors, numCasesPerVector, numVectors*numCasesPerVector))
      w.indent()
      if PACKED:
        # pack in python
        l, nbits = pack(toStateArray)
        subs.append(('NBITSSTATES%d' % i, str(nbits)))
        w('  private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
          (i, nbits, renderList([hex(long(x)) for x in l])))

        l, nbits = pack(toOffsetIncrsArray)
        subs.append(('NBITSOFFSET%d' % i, str(nbits)))
        w('  private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
          (i, nbits, renderList([hex(long(x)) for x in l])))
      else:
        w('  private final static int[] toStates%d = new int[] %s;' % \
          (i, renderList([str(x) for x in toStateArray])))
        w('  private final static int[] offsetIncrs%d = new int[] %s;' % \
          (i, renderList([str(x) for x in toStateArray])))
    w.outdent()
  
  stateMap2 = dict([[v,k] for k,v in stateMap.items()])
  w('')
  w('// state map')
  sum = 0
  minErrors = []
  for i in xrange(len(stateMap2)-1):
    w('//   %s -> %s' % (i, stateMap2[i]))
    # we replace t-notation as its not relevant here
    st = stateMap2[i].replace('t', '')
    
    v = eval(st)
    minError = min([-i+e for i, e in v])
    c = len(v)
    sum += c
    minErrors.append(minError)
  w('')

  w.indent()
  #w('private final static int[] minErrors = new int[] {%s};' % ','.join([str(x) for x in minErrors]))

  w.outdent()

  w('')
  w('  public %s(int w) {' % className)
  w('    super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
  w('  }')

  if 0:
    w('')
    w('@Override')
    w('public int size() { // this can now move up?')
    w('  return %d*(w+1);' % (len(stateMap2)-1))
    w('}')

    w('')
    w('@Override')
    w('public int getPosition(int absState) { // this can now move up?')
    w('  return absState % (w+1);')
    w('}')

    w('')
    w('@Override')
    w('public boolean isAccept(int absState) { // this can now move up?')
    w('  // decode absState -> state, offset')
    w('  int state = absState/(w+1);')
    w('  if (true || state < minErrors.length) {')
    w('    int offset = absState%(w+1);')
    w('    assert offset >= 0;')
    w('    return w - offset + minErrors[state] <= %d;' % n)
    w('  } else {')
    w('    return false;')
    w('  }')
    w('}')

  if MODE == 'array' and PACKED:

    # we moved into super class
    if False:
      w('')

      v = 2
      l = []
      for i in range(63):
        l.append(hex(v-1))
        v *= 2

      w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1)
      w('')

      # unpack in java
      w('private int unpack(long[] data, int index, int bitsPerValue) {')
      w('  final long bitLoc = bitsPerValue * index;')
      w('  final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD)
      w('  final int bitStart = (int) (bitLoc & %d);' % (WORD-1))
      w('  //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);')
      w('  if (bitStart + bitsPerValue <= %d) {' % WORD)
      w('    // not split')
      w('    return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);')
      w('  } else {')
      w('    // split')
      w('    final int part = %d-bitStart;' % WORD)
      w('    return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +')
      w('      ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1)
      w('  }')
      w('}')
  
  # class
  w('}')
  w('')

  fileOut = '%s.java' % className

  s = str(w)
  for sub, repl in subs:
    s = s.replace(sub, repl)

  open(fileOut, 'wb').write(s)

  print 'Wrote %s [%d lines; %.1f KB]' % \
        (fileOut, len(w.l), os.path.getsize(fileOut)/1024.)
def main():

  if len(sys.argv) != 3:
    print
    print 'Usage: python -u %s N <True/False>' % sys.argv[0]
    print
    print 'NOTE: the resulting .java file is created in the current working dir!'
    print
    sys.exit(1)

  n = int(sys.argv[1])

  transpose = (sys.argv[2] == "True")

  tables = genTransitions(n, transpose)

  stateMap = {}

  # init null state
  stateMap['[]'] = -1

  # init start state
  stateMap['[(0, 0)]'] = 0

  w = LineOutput()

  w('package org.apache.lucene.util.automaton;')
  w('')
  w('/*')
  w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
  w(' * contributor license agreements.  See the NOTICE file distributed with')
  w(' * this work for additional information regarding copyright ownership.')
  w(' * The ASF licenses this file to You under the Apache License, Version 2.0')
  w(' * (the "License"); you may not use this file except in compliance with')
  w(' * the License.  You may obtain a copy of the License at')
  w(' *')
  w(' *     http://www.apache.org/licenses/LICENSE-2.0')
  w(' *')
  w(' * Unless required by applicable law or agreed to in writing, software')
  w(' * distributed under the License is distributed on an "AS IS" BASIS,')
  w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.')
  w(' * See the License for the specific language governing permissions and')
  w(' * limitations under the License.')
  w(' */')
  w('')
  w('// The following code was generated with the moman/finenight pkg')
  w('// This package is available under the MIT License, see NOTICE.txt')
  w('// for more details.')
  w('')
  w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
  w('')
  if transpose:
    w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
    w('    with transpositions as primitive edits */')
    className = 'Lev%dTParametricDescription' % n
  else:
    w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
    className = 'Lev%dParametricDescription' % n

  w('class %s extends ParametricDescription {' % className)

  w('')
  w('@Override')
  w('int transition(int absState, int position, int vector) {')

  w('  // null absState should never be passed in')
  w('  assert absState != -1;')

  w('')
  w('  // decode absState -> state, offset')
  w('  int state = absState/(w+1);')
  w('  int offset = absState%(w+1);')
  w('  assert offset >= 0;')
  w('')  

  machines = []
  
  for i, map in enumerate(tables):
    if i == 0:
      w('if (position == w) {')
    elif i == len(tables)-1:
      w('} else {')
    else:
      w('} else if (position == w-%d) {' % i)

    if i != 0 and MODE == 'switch':
      w('switch(vector) {')

    l = map.items()
    l.sort()

    numCasesPerVector = None
    numVectors = len(l)

    if MODE == 'array':
      toStateArray = []
      toOffsetIncrArray = []

    for charVar, states in l:

      # somehow it's a string:
      charVar = eval(charVar)

      if i != 0 and MODE == 'switch':
        w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
        w.indent()
        
      l = states.items()

      byFromState = {}

      # first pass to assign states
      byAction = {}
      for s, (toS, offset) in l:
        state = str(s)
        
        toState = str(toS)
        if state not in stateMap:
          stateMap[state] = len(stateMap)-1
        if toState not in stateMap:
          stateMap[toState] = len(stateMap)-1

        byFromState[stateMap[state]] = (1+stateMap[toState], offset)

        fromStateDesc = s[1:len(s)-1]
        toStateDesc = ', '.join([str(x) for x in toS])   

        tup = (stateMap[toState], toStateDesc, offset)
        if tup not in byAction:
          byAction[tup] = []
        byAction[tup].append((fromStateDesc, stateMap[state]))

      if numCasesPerVector is None:
        numCasesPerVector = len(l)
      else:
        # we require this to be uniform... empirically it seems to be!
        assert numCasesPerVector == len(l)

      if MODE == 'array':

        for s in range(numCasesPerVector):
          toState, offsetIncr = byFromState[s]
          toStateArray.append(toState)
          toOffsetIncrArray.append(offsetIncr)

      else:

        # render switches
        w('switch(state) {   // %s cases' % len(l))

        for (toState, toStateDesc, offset), lx in byAction.items():
          for fromStateDesc, fromState in lx:
            w('case %s: // %s' % (fromState, fromStateDesc))
          w.indent()
          w('  state = %s; // %s' % (toState, toStateDesc))
          if offset > 0:
            w('  offset += %s;' % offset)
          w('break;')
          w.outdent()

        w('}')
        if i != 0:
          w('break;')
          w.outdent()

    if MODE == 'array':
      # strangely state can come in wildly out of bounds....
      w('  if (state < %d) {' % numCasesPerVector)
      w('    final int loc = vector * %d + state;' % numCasesPerVector)
      if PACKED:
        w('    offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
        w('    state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
      else:
        w('    offset += offsetIncrs%d[loc];' % i)
        w('    state = toStates%d[loc]-1;' % i)
      w('  }')
    elif i != 0:
      w('}')

    machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))

  # ends switch statement for machine
  w('}')

  w('')

  w('  if (state == -1) {')
  w('    // null state')
  w('    return -1;')
  w('  } else {')
  w('    // translate back to abs')
  w('    return state*(w+1)+offset;')
  w('  }')

  # ends transition method
  w('}')

  subs = []
  if MODE == 'array':
    w.indent()
    for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
      w('')
      w.outdent()
      w('// %d vectors; %d states per vector; array length = %d' % \
        (numVectors, numCasesPerVector, numVectors*numCasesPerVector))
      w.indent()
      if PACKED:
        # pack in python
        l, nbits = pack(toStateArray)
        subs.append(('NBITSSTATES%d' % i, str(nbits)))
        w('  private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
          (i, nbits, renderList([hex(long(x)) for x in l])))

        l, nbits = pack(toOffsetIncrsArray)
        subs.append(('NBITSOFFSET%d' % i, str(nbits)))
        w('  private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
          (i, nbits, renderList([hex(long(x)) for x in l])))
      else:
        w('  private final static int[] toStates%d = new int[] %s;' % \
          (i, renderList([str(x) for x in toStateArray])))
        w('  private final static int[] offsetIncrs%d = new int[] %s;' % \
          (i, renderList([str(x) for x in toStateArray])))
    w.outdent()
  
  stateMap2 = dict([[v,k] for k,v in stateMap.items()])
  w('')
  w('// state map')
  sum = 0
  minErrors = []
  for i in xrange(len(stateMap2)-1):
    w('//   %s -> %s' % (i, stateMap2[i]))
    # we replace t-notation as it's not relevant here
    st = stateMap2[i].replace('t', '')
    
    v = eval(st)
    minError = min([-i+e for i, e in v])
    c = len(v)
    sum += c
    minErrors.append(minError)
  w('')

  w.indent()
  #w('private final static int[] minErrors = new int[] {%s};' % ','.join([str(x) for x in minErrors]))

  w.outdent()

  w('')
  w('  public %s(int w) {' % className)
  w('    super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
  w('  }')

  if 0:
    w('')
    w('@Override')
    w('public int size() { // this can now move up?')
    w('  return %d*(w+1);' % (len(stateMap2)-1))
    w('}')

    w('')
    w('@Override')
    w('public int getPosition(int absState) { // this can now move up?')
    w('  return absState % (w+1);')
    w('}')

    w('')
    w('@Override')
    w('public boolean isAccept(int absState) { // this can now move up?')
    w('  // decode absState -> state, offset')
    w('  int state = absState/(w+1);')
    w('  if (true || state < minErrors.length) {')
    w('    int offset = absState%(w+1);')
    w('    assert offset >= 0;')
    w('    return w - offset + minErrors[state] <= %d;' % n)
    w('  } else {')
    w('    return false;')
    w('  }')
    w('}')

  if MODE == 'array' and PACKED:

    # we moved into super class
    if False:
      w('')

      v = 2
      l = []
      for i in range(63):
        l.append(hex(v-1))
        v *= 2

      w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1)
      w('')

      # unpack in java
      w('private int unpack(long[] data, int index, int bitsPerValue) {')
      w('  final long bitLoc = bitsPerValue * index;')
      w('  final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD)
      w('  final int bitStart = (int) (bitLoc & %d);' % (WORD-1))
      w('  //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);')
      w('  if (bitStart + bitsPerValue <= %d) {' % WORD)
      w('    // not split')
      w('    return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);')
      w('  } else {')
      w('    // split')
      w('    final int part = %d-bitStart;' % WORD)
      w('    return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +')
      w('      ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1)
      w('  }')
      w('}')
  
  # class
  w('}')
  w('')

  fileOut = '%s.java' % className

  s = str(w)
  for sub, repl in subs:
    s = s.replace(sub, repl)

  open(fileOut, 'wb').write(s)

  print 'Wrote %s [%d lines; %.1f KB]' % \
        (fileOut, len(w.l), os.path.getsize(fileOut)/1024.)