def transitions(distance):
    """Regroup the transition tables for ``distance`` by source state.

    ``possibleStates.genTransitions(distance)`` is iterated in order; each
    item is a mapping whose keys are passed to ``eval`` and whose values are
    themselves mappings (``key -> value``).

    Returns a dict keyed by the item's index.  Each value maps an inner key
    to ``{bitShift(eval(outer_key)): inner_value}``, i.e. the two nesting
    levels of the input are swapped and the outer key is replaced by its
    bit-shifted form.
    """
    per_index = possibleStates.genTransitions(distance)
    regrouped = {}
    for index, by_vector in enumerate(per_index):
        by_state = {}
        for vector_repr, moves in by_vector.iteritems():
            # The outer key is evaluated before use, so it is presumably the
            # string form of a vector -- TODO confirm against possibleStates.
            shifted = bitShift(eval(vector_repr))
            for state_key, target in moves.items():
                by_state.setdefault(state_key, {})[shifted] = target
        regrouped[index] = by_state
    return regrouped
def main():
    """Generate the Java ``Lev<N>[T]ParametricDescription`` source file.

    Command line: ``python -u <script> N <True/False>``

    * ``N`` -- integer degree, passed to ``genTransitions`` and written into
      the generated class name and constructor.
    * ``True``/``False`` -- only the exact string ``"True"`` enables the
      transposition variant (class name gets a ``T`` infix).

    With any other argument count a usage message is printed and the process
    exits with status 1.

    The generated text is accumulated in a ``LineOutput`` and finally written
    to ``<className>.java`` in the current working directory, after which a
    one-line summary is printed.  The module-level ``MODE`` ('array' or
    'switch') and ``PACKED`` settings select how the transition tables are
    rendered.

    NOTE: this file previously contained two identical definitions of
    ``main``; the later one shadowed the earlier, so only one is kept.
    """
    # Single-argument, parenthesized print behaves identically to the
    # Python 2 print statement (print('') emits just a newline).
    if len(sys.argv) != 3:
        print('')
        print('Usage: python -u %s N <True/False>' % sys.argv[0])
        print('')
        print('NOTE: the resulting .java file is created in the current working dir!')
        print('')
        sys.exit(1)

    n = int(sys.argv[1])
    transpose = (sys.argv[2] == "True")

    tables = genTransitions(n, transpose)

    # Maps str(state) -> small integer id; ids are handed out in first-seen
    # order as len(stateMap)-1, so the null state keeps -1 and start keeps 0.
    stateMap = {}
    # init null state
    stateMap['[]'] = -1
    # init start state
    stateMap['[(0, 0)]'] = 0

    w = LineOutput()

    # ---- file header: package, license, provenance note, import ----------
    for headerLine in (
            'package org.apache.lucene.util.automaton;',
            '',
            '/*',
            ' * Licensed to the Apache Software Foundation (ASF) under one or more',
            ' * contributor license agreements. See the NOTICE file distributed with',
            ' * this work for additional information regarding copyright ownership.',
            ' * The ASF licenses this file to You under the Apache License, Version 2.0',
            ' * (the "License"); you may not use this file except in compliance with',
            ' * the License. You may obtain a copy of the License at',
            ' *',
            ' * http://www.apache.org/licenses/LICENSE-2.0',
            ' *',
            ' * Unless required by applicable law or agreed to in writing, software',
            ' * distributed under the License is distributed on an "AS IS" BASIS,',
            ' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.',
            ' * See the License for the specific language governing permissions and',
            ' * limitations under the License.',
            ' */',
            '',
            '// The following code was generated with the moman/finenight pkg',
            '// This package is available under the MIT License, see NOTICE.txt',
            '// for more details.',
            '',
            'import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;',
            ''):
        w(headerLine)

    if transpose:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
        w(' with transpositions as primitive edits */')
        className = 'Lev%dTParametricDescription' % n
    else:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
        className = 'Lev%dParametricDescription' % n

    w('class %s extends ParametricDescription {' % className)

    # ---- transition() method ----------------------------------------------
    w('')
    w('@Override')
    w('int transition(int absState, int position, int vector) {')
    w(' // null absState should never be passed in')
    w(' assert absState != -1;')
    w('')
    w(' // decode absState -> state, offset')
    w(' int state = absState/(w+1);')
    w(' int offset = absState%(w+1);')
    w(' assert offset >= 0;')
    w('')

    # One (toStates, offsetIncrs, casesPerVector, numVectors) tuple per table;
    # only filled (and only consumed) in 'array' mode.
    machines = []

    for i, table in enumerate(tables):
        if i == 0:
            w('if (position == w) {')
        elif i == len(tables)-1:
            w('} else {')
        else:
            w('} else if (position == w-%d) {' % i)

        if i != 0 and MODE == 'switch':
            w('switch(vector) {')

        vectorItems = table.items()
        vectorItems.sort()

        numCasesPerVector = None
        numVectors = len(vectorItems)

        if MODE == 'array':
            toStateArray = []
            toOffsetIncrArray = []

        for charVar, states in vectorItems:

            # somehow it's a string:
            # NOTE(review): eval() is applied to keys produced by
            # genTransitions; never feed externally supplied data here.
            charVar = eval(charVar)

            if i != 0 and MODE == 'switch':
                w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
                w.indent()

            moves = states.items()

            # from-state id -> (1 + to-state id, offset increment)
            byFromState = {}

            # first pass to assign states
            # (to-state id, description, offset) -> [(from description, from id)]
            byAction = {}
            for s, (toS, offset) in moves:
                state = str(s)
                toState = str(toS)
                if state not in stateMap:
                    stateMap[state] = len(stateMap)-1
                if toState not in stateMap:
                    stateMap[toState] = len(stateMap)-1

                byFromState[stateMap[state]] = (1+stateMap[toState], offset)

                # drop the first and last character of the from-state key
                fromStateDesc = s[1:len(s)-1]
                toStateDesc = ', '.join([str(x) for x in toS])

                tup = (stateMap[toState], toStateDesc, offset)
                if tup not in byAction:
                    byAction[tup] = []
                byAction[tup].append((fromStateDesc, stateMap[state]))

            if numCasesPerVector is None:
                numCasesPerVector = len(moves)
            else:
                # we require this to be uniform... empirically it seems to be!
                assert numCasesPerVector == len(moves)

            if MODE == 'array':
                for fromId in range(numCasesPerVector):
                    toState, offsetIncr = byFromState[fromId]
                    toStateArray.append(toState)
                    toOffsetIncrArray.append(offsetIncr)
            else:
                # render switches
                w('switch(state) { // %s cases' % len(moves))

                for (toState, toStateDesc, offset), fromStates in byAction.items():
                    # all from-states sharing an action become stacked
                    # case labels in front of a single body
                    for fromStateDesc, fromState in fromStates:
                        w('case %s: // %s' % (fromState, fromStateDesc))
                    w.indent()
                    w(' state = %s; // %s' % (toState, toStateDesc))
                    if offset > 0:
                        w(' offset += %s;' % offset)
                    w('break;')
                    w.outdent()

                w('}')
                if i != 0:
                    w('break;')
                    w.outdent()

        if MODE == 'array':
            # strangely state can come in wildly out of bounds....
            w(' if (state < %d) {' % numCasesPerVector)
            w(' final int loc = vector * %d + state;' % numCasesPerVector)
            if PACKED:
                # NBITSOFFSET<i> / NBITSSTATES<i> are placeholders that get
                # substituted in the final text once the bit widths are known.
                w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
                w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
            else:
                w(' offset += offsetIncrs%d[loc];' % i)
                w(' state = toStates%d[loc]-1;' % i)
            w(' }')
            # Fix: only record the machine in array mode; the two arrays do
            # not exist in switch mode and the append used to raise NameError.
            machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))
        elif i != 0:
            w('}')

    # ends switch statement for machine
    w('}')
    w('')
    w(' if (state == -1) {')
    w(' // null state')
    w(' return -1;')
    w(' } else {')
    w(' // translate back to abs')
    w(' return state*(w+1)+offset;')
    w(' }')

    # ends transition method
    w('}')

    # ---- static lookup tables (array mode only) ---------------------------
    subs = []
    if MODE == 'array':
        w.indent()
        for idx, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
            w('')
            w.outdent()
            w('// %d vectors; %d states per vector; array length = %d' %
              (numVectors, numCasesPerVector, numVectors*numCasesPerVector))
            w.indent()
            if PACKED:
                # pack in python
                packed, nbits = pack(toStateArray)
                subs.append(('NBITSSTATES%d' % idx, str(nbits)))
                w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' %
                  (idx, nbits, renderList([hex(long(x)) for x in packed])))

                packed, nbits = pack(toOffsetIncrsArray)
                subs.append(('NBITSOFFSET%d' % idx, str(nbits)))
                w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' %
                  (idx, nbits, renderList([hex(long(x)) for x in packed])))
            else:
                w(' private final static int[] toStates%d = new int[] %s;' %
                  (idx, renderList([str(x) for x in toStateArray])))
                # Fix: render the offset increments here; the original
                # rendered toStateArray a second time.
                w(' private final static int[] offsetIncrs%d = new int[] %s;' %
                  (idx, renderList([str(x) for x in toOffsetIncrsArray])))
        w.outdent()

    # ---- state map comment + per-state minimum error -----------------------
    stateMap2 = dict((v, k) for k, v in stateMap.items())
    w('')
    w('// state map')
    minErrors = []
    for stateId in xrange(len(stateMap2)-1):
        w('// %s -> %s' % (stateId, stateMap2[stateId]))
        # we replace t-notation as it's not relevant here
        st = stateMap2[stateId].replace('t', '')
        # NOTE(review): eval() of a string built by this script itself.
        entries = eval(st)
        minErrors.append(min([-pos + err for pos, err in entries]))
    w('')

    # Kept from the original: an indent/outdent pair that used to surround a
    # (now removed) commented-out emit; net effect is presumably none.
    w.indent()
    w.outdent()

    w('')
    w(' public %s(int w) {' % className)
    w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
    w(' }')

    # The original carried disabled (`if 0:` / `if False:`) emitters for
    # size(), getPosition(), isAccept(), MASKS and unpack(); they never ran
    # and have been dropped (the original note said they moved to the super
    # class).

    # class
    w('}')
    w('')

    fileOut = '%s.java' % className

    s = str(w)
    for sub, repl in subs:
        s = s.replace(sub, repl)

    # Close the file deterministically so getsize() below sees all the bytes.
    with open(fileOut, 'wb') as out:
        out.write(s)

    print('Wrote %s [%d lines; %.1f KB]' %
          (fileOut, len(w.l), os.path.getsize(fileOut)/1024.))