class CategoryEncoder(Encoder): """Encodes a list of discrete categories (described by strings), that aren't related to each other, so we never emit a mixture of categories. The value of zero is reserved for "unknown category" Internally we use a ScalarEncoder with a radius of 1, but since we only encode integers, we never get mixture outputs. The SDRCategoryEncoder uses a different method to encode categories""" ############################################################################ def __init__(self, w, categoryList, name="category", verbosity=0): self.encoders = None self.verbosity = verbosity # number of categories includes "unknown" self.ncategories = len(categoryList) + 1 self.categoryToIndex = dict() self.indexToCategory = dict() self.indexToCategory[0] = "<UNKNOWN>" for i in xrange(len(categoryList)): self.categoryToIndex[categoryList[i]] = i+1 self.indexToCategory[i+1] = categoryList[i] self.encoder = ScalarEncoder(w, minval=0, maxval=self.ncategories - 1, radius=1, periodic=False) self.width = w * self.ncategories assert self.encoder.getWidth() == self.width self.description = [(name, 0)] self.name = name # These are used to support the topDownCompute method self._topDownMappingM = None # This gets filled in by getBucketValues self._bucketValues = None ############################################################################ def getDecoderOutputFieldTypes(self): """ [Encoder class virtual method override] """ # TODO: change back to string meta-type after the decoding logic is fixed # to output strings instead of internal index values. #return (FieldMetaType.string,) return (FieldMetaType.integer,) ############################################################################ def getWidth(self): return self.width ############################################################################ def getDescription(self): return self.description ############################################################################ def getScalars(self, input): """ See method description in base.py """ if input == SENTINEL_VALUE_FOR_MISSING_DATA: return numpy.array([None]) else: return numpy.array([self.categoryToIndex.get(input, 0)]) ############################################################################ def getBucketIndices(self, input): """ See method description in base.py """ # Get the bucket index from the underlying scalar encoder if input == SENTINEL_VALUE_FOR_MISSING_DATA: return [None] else: return self.encoder.getBucketIndices(self.categoryToIndex.get(input, 0)) ############################################################################ def encodeIntoArray(self, input, output): # if not found, we encode category 0 if input == SENTINEL_VALUE_FOR_MISSING_DATA: output[0:] = 0 val = "<missing>" else: val = self.categoryToIndex.get(input, 0) self.encoder.encodeIntoArray(val, output) if self.verbosity >= 2: print "input:", input, "va:", val, "output:", output print "decoded:", self.decodedToStr(self.decode(output)) ############################################################################ def decode(self, encoded, parentFieldName=''): """ See the function description in base.py """ # Get the scalar values from the underlying scalar encoder (fieldsDict, fieldNames) = self.encoder.decode(encoded) if len(fieldsDict) == 0: return (fieldsDict, fieldNames) # Expect only 1 field assert(len(fieldsDict) == 1) # Get the list of categories the scalar values correspond to and # generate the description from the category name(s). (inRanges, inDesc) = fieldsDict.values()[0] outRanges = [] desc = "" for (minV, maxV) in inRanges: minV = int(round(minV)) maxV = int(round(maxV)) outRanges.append((minV, maxV)) while minV <= maxV: if len(desc) > 0: desc += ", " desc += self.indexToCategory[minV] minV += 1 # Return result if parentFieldName != '': fieldName = "%s.%s" % (parentFieldName, self.name) else: fieldName = self.name return ({fieldName: (outRanges, desc)}, [fieldName]) ############################################################################ def closenessScores(self, expValues, actValues, fractional=True,): """ See the function description in base.py kwargs will have the keyword "fractional", which is ignored by this encoder """ expValue = expValues[0] actValue = actValues[0] if expValue == actValue: closeness = 1.0 else: closeness = 0.0 if not fractional: closeness = 1.0 - closeness #print "category::", "expValue:", expValue, "actValue:", actValue, \ # "closeness", closeness #import pdb; pdb.set_trace() return numpy.array([closeness]) ############################################################################ def getBucketValues(self): """ See the function description in base.py """ if self._bucketValues is None: numBuckets = len(self.encoder.getBucketValues()) self._bucketValues = [] for bucketIndex in range(numBuckets): self._bucketValues.append(self.getBucketInfo([bucketIndex])[0].value) return self._bucketValues ############################################################################ def getBucketInfo(self, buckets): """ See the function description in base.py """ # For the category encoder, the bucket index is the category index bucketInfo = self.encoder.getBucketInfo(buckets)[0] categoryIndex = int(round(bucketInfo.value)) category = self.indexToCategory[categoryIndex] return [EncoderResult(value=category, scalar=categoryIndex, encoding=bucketInfo.encoding)] ############################################################################ def topDownCompute(self, encoded): """ See the function description in base.py """ encoderResult = self.encoder.topDownCompute(encoded)[0] value = encoderResult.value categoryIndex = int(round(value)) category = self.indexToCategory[categoryIndex] return EncoderResult(value=category, scalar=categoryIndex, encoding=encoderResult.encoding)
class CategoryEncoder(Encoder): """Encodes a list of discrete categories (described by strings), that aren't related to each other, so we never emit a mixture of categories. The value of zero is reserved for "unknown category" Internally we use a ScalarEncoder with a radius of 1, but since we only encode integers, we never get mixture outputs. The SDRCategoryEncoder uses a different method to encode categories""" ############################################################################ def __init__(self, w, categoryList, name="category", verbosity=0): self.encoders = None self.verbosity = verbosity # number of categories includes "unknown" self.ncategories = len(categoryList) + 1 self.categoryToIndex = dict() self.indexToCategory = dict() self.indexToCategory[0] = "<UNKNOWN>" for i in xrange(len(categoryList)): self.categoryToIndex[categoryList[i]] = i + 1 self.indexToCategory[i + 1] = categoryList[i] self.encoder = ScalarEncoder(w, minval=0, maxval=self.ncategories - 1, radius=1, periodic=False) self.width = w * self.ncategories assert self.encoder.getWidth() == self.width self.description = [(name, 0)] self.name = name # These are used to support the topDownCompute method self._topDownMappingM = None # This gets filled in by getBucketValues self._bucketValues = None ############################################################################ def getDecoderOutputFieldTypes(self): """ [Encoder class virtual method override] """ # TODO: change back to string meta-type after the decoding logic is fixed # to output strings instead of internal index values. #return (FieldMetaType.string,) return (FieldMetaType.integer, ) ############################################################################ def getWidth(self): return self.width ############################################################################ def getDescription(self): return self.description ############################################################################ def getScalars(self, input): """ See method description in base.py """ if input == SENTINEL_VALUE_FOR_MISSING_DATA: return numpy.array([None]) else: return numpy.array([self.categoryToIndex.get(input, 0)]) ############################################################################ def getBucketIndices(self, input): """ See method description in base.py """ # Get the bucket index from the underlying scalar encoder if input == SENTINEL_VALUE_FOR_MISSING_DATA: return [None] else: return self.encoder.getBucketIndices( self.categoryToIndex.get(input, 0)) ############################################################################ def encodeIntoArray(self, input, output): # if not found, we encode category 0 if input == SENTINEL_VALUE_FOR_MISSING_DATA: output[0:] = 0 val = "<missing>" else: val = self.categoryToIndex.get(input, 0) self.encoder.encodeIntoArray(val, output) if self.verbosity >= 2: print "input:", input, "va:", val, "output:", output print "decoded:", self.decodedToStr(self.decode(output)) ############################################################################ def decode(self, encoded, parentFieldName=''): """ See the function description in base.py """ # Get the scalar values from the underlying scalar encoder (fieldsDict, fieldNames) = self.encoder.decode(encoded) if len(fieldsDict) == 0: return (fieldsDict, fieldNames) # Expect only 1 field assert (len(fieldsDict) == 1) # Get the list of categories the scalar values correspond to and # generate the description from the category name(s). (inRanges, inDesc) = fieldsDict.values()[0] outRanges = [] desc = "" for (minV, maxV) in inRanges: minV = int(round(minV)) maxV = int(round(maxV)) outRanges.append((minV, maxV)) while minV <= maxV: if len(desc) > 0: desc += ", " desc += self.indexToCategory[minV] minV += 1 # Return result if parentFieldName != '': fieldName = "%s.%s" % (parentFieldName, self.name) else: fieldName = self.name return ({fieldName: (outRanges, desc)}, [fieldName]) ############################################################################ def closenessScores( self, expValues, actValues, fractional=True, ): """ See the function description in base.py kwargs will have the keyword "fractional", which is ignored by this encoder """ expValue = expValues[0] actValue = actValues[0] if expValue == actValue: closeness = 1.0 else: closeness = 0.0 if not fractional: closeness = 1.0 - closeness #print "category::", "expValue:", expValue, "actValue:", actValue, \ # "closeness", closeness #import pdb; pdb.set_trace() return numpy.array([closeness]) ############################################################################ def getBucketValues(self): """ See the function description in base.py """ if self._bucketValues is None: numBuckets = len(self.encoder.getBucketValues()) self._bucketValues = [] for bucketIndex in range(numBuckets): self._bucketValues.append( self.getBucketInfo([bucketIndex])[0].value) return self._bucketValues ############################################################################ def getBucketInfo(self, buckets): """ See the function description in base.py """ # For the category encoder, the bucket index is the category index bucketInfo = self.encoder.getBucketInfo(buckets)[0] categoryIndex = int(round(bucketInfo.value)) category = self.indexToCategory[categoryIndex] return [ EncoderResult(value=category, scalar=categoryIndex, encoding=bucketInfo.encoding) ] ############################################################################ def topDownCompute(self, encoded): """ See the function description in base.py """ encoderResult = self.encoder.topDownCompute(encoded)[0] value = encoderResult.value categoryIndex = int(round(value)) category = self.indexToCategory[categoryIndex] return EncoderResult(value=category, scalar=categoryIndex, encoding=encoderResult.encoding)
class LogEncoder(Encoder): """A Log encoder represents a floating point value on a logarithmic (decibel) scale. valueToEncode = 10 * log10(input) The default resolution (minimum difference in scaled values which is guaranteed to propduce different outputs) is 1 decibel. For example, the scaled values 10 and 11 will be distinguishable in the output. In terms of the original input values, this means 10^1 (10) and 10^1.1 (12.5) will be distinguishable. resolution -- encoder resolution, in terms of scaled values. Default: 1 decibel minval -- must be greater than 0. Lower values are reset to this value maxval -- Higher values are reset to this value """ def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000, name="log", verbosity=0): self.encoders = None self.verbosity = verbosity self.minScaledValue = int(10 * math.log10(minval)) self.maxScaledValue = int(math.ceil(10 * math.log10(maxval))) assert self.maxScaledValue > self.minScaledValue self.minval = 10 ** (self.minScaledValue / 10.0) self.maxval = 10 ** (self.maxScaledValue / 10.0) # Note: passing resolution=1 causes the test to topDownCompute # test to fail. Fixed for now by always converting to float, # but should find the root cause. self.encoder = ScalarEncoder( w=w, minval=self.minScaledValue, maxval=self.maxScaledValue, periodic=False, resolution=float(resolution) ) self.width = self.encoder.getWidth() self.description = [(name, 0)] self.name = name # This list is created by getBucketValues() the first time it is called, # and re-created whenever our buckets would be re-arranged. self._bucketValues = None ############################################################################ def getWidth(self): return self.width ############################################################################ def getDescription(self): return self.description ############################################################################ def _getScaledValue(self, input): """ Convert the input, which is in normal space, into log space """ if input == SENTINEL_VALUE_FOR_MISSING_DATA: return None else: val = input if val < self.minval: val = self.minval elif val > self.maxval: val = self.maxval scaledVal = 10 * math.log10(val) return scaledVal ############################################################################ def getBucketIndices(self, input): """ See the function description in base.py """ # Get the scaled value scaledVal = self._getScaledValue(input) if scaledVal is None: return [None] else: return self.encoder.getBucketIndices(scaledVal) ############################################################################ def encodeIntoArray(self, input, output): """ See the function description in base.py """ # Get the scaled value scaledVal = self._getScaledValue(input) if scaledVal is None: output[0:] = 0 else: self.encoder.encodeIntoArray(scaledVal, output) if self.verbosity >= 2: print "input:", input, "scaledVal:", scaledVal, "output:", output print "decoded:", self.decodedToStr(self.decode(output)) ############################################################################ def decode(self, encoded, parentFieldName=""): """ See the function description in base.py """ # Get the scalar values from the underlying scalar encoder (fieldsDict, fieldNames) = self.encoder.decode(encoded) if len(fieldsDict) == 0: return (fieldsDict, fieldNames) # Expect only 1 field assert len(fieldsDict) == 1 # Convert each range into normal space (inRanges, inDesc) = fieldsDict.values()[0] outRanges = [] for (minV, maxV) in inRanges: outRanges.append((math.pow(10, minV / 10.0), math.pow(10, maxV / 10.0))) # Generate a text description of the ranges desc = "" numRanges = len(outRanges) for i in xrange(numRanges): if outRanges[i][0] != outRanges[i][1]: desc += "%.2f-%.2f" % (outRanges[i][0], outRanges[i][1]) else: desc += "%.2f" % (outRanges[i][0]) if i < numRanges - 1: desc += ", " # Return result if parentFieldName != "": fieldName = "%s.%s" % (parentFieldName, self.name) else: fieldName = self.name return ({fieldName: (outRanges, desc)}, [fieldName]) ############################################################################ def getBucketValues(self): """ See the function description in base.py """ # Need to re-create? if self._bucketValues is None: scaledValues = self.encoder.getBucketValues() self._bucketValues = [] for scaledValue in scaledValues: value = math.pow(10, scaledValue / 10.0) self._bucketValues.append(value) return self._bucketValues ############################################################################ def getBucketInfo(self, buckets): """ See the function description in base.py """ scaledResult = self.encoder.getBucketInfo(buckets)[0] scaledValue = scaledResult.value value = math.pow(10, scaledValue / 10.0) return [EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding)] ############################################################################ def topDownCompute(self, encoded): """ See the function description in base.py """ scaledResult = self.encoder.topDownCompute(encoded)[0] scaledValue = scaledResult.value value = math.pow(10, scaledValue / 10.0) return EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding) ############################################################################ def closenessScores(self, expValues, actValues, fractional=True): """ See the function description in base.py """ # Compute the percent error in log space if expValues[0] > 0: expValue = 10 * math.log10(expValues[0]) else: expValue = self.minScaledValue if actValues[0] > 0: actValue = 10 * math.log10(actValues[0]) else: actValue = self.minScaledValue if fractional: err = abs(expValue - actValue) pctErr = err / (self.maxScaledValue - self.minScaledValue) pctErr = min(1.0, pctErr) closeness = 1.0 - pctErr else: err = abs(expValue - actValue) closeness = err # print "log::", "expValue:", expValues[0], "actValue:", actValues[0], \ # "closeness", closeness # import pdb; pdb.set_trace() return numpy.array([closeness])
class LogEncoder(Encoder): """A Log encoder represents a floating point value on a logarithmic (decibel) scale. valueToEncode = 10 * log10(input) The default resolution (minimum difference in scaled values which is guaranteed to propduce different outputs) is 1 decibel. For example, the scaled values 10 and 11 will be distinguishable in the output. In terms of the original input values, this means 10^1 (10) and 10^1.1 (12.5) will be distinguishable. resolution -- encoder resolution, in terms of scaled values. Default: 1 decibel minval -- must be greater than 0. Lower values are reset to this value maxval -- Higher values are reset to this value """ def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000, name="log", verbosity=0): self.encoders = None self.verbosity = verbosity self.minScaledValue = int(10 * math.log10(minval)) self.maxScaledValue = int(math.ceil(10 * math.log10(maxval))) assert self.maxScaledValue > self.minScaledValue self.minval = 10**(self.minScaledValue / 10.0) self.maxval = 10**(self.maxScaledValue / 10.0) # Note: passing resolution=1 causes the test to topDownCompute # test to fail. Fixed for now by always converting to float, # but should find the root cause. self.encoder = ScalarEncoder(w=w, minval=self.minScaledValue, maxval=self.maxScaledValue, periodic=False, resolution=float(resolution)) self.width = self.encoder.getWidth() self.description = [(name, 0)] self.name = name # This list is created by getBucketValues() the first time it is called, # and re-created whenever our buckets would be re-arranged. self._bucketValues = None ############################################################################ def getWidth(self): return self.width ############################################################################ def getDescription(self): return self.description ############################################################################ def _getScaledValue(self, input): """ Convert the input, which is in normal space, into log space """ if input == SENTINEL_VALUE_FOR_MISSING_DATA: return None else: val = input if val < self.minval: val = self.minval elif val > self.maxval: val = self.maxval scaledVal = 10 * math.log10(val) return scaledVal ############################################################################ def getBucketIndices(self, input): """ See the function description in base.py """ # Get the scaled value scaledVal = self._getScaledValue(input) if scaledVal is None: return [None] else: return self.encoder.getBucketIndices(scaledVal) ############################################################################ def encodeIntoArray(self, input, output): """ See the function description in base.py """ # Get the scaled value scaledVal = self._getScaledValue(input) if scaledVal is None: output[0:] = 0 else: self.encoder.encodeIntoArray(scaledVal, output) if self.verbosity >= 2: print "input:", input, "scaledVal:", scaledVal, "output:", output print "decoded:", self.decodedToStr(self.decode(output)) ############################################################################ def decode(self, encoded, parentFieldName=''): """ See the function description in base.py """ # Get the scalar values from the underlying scalar encoder (fieldsDict, fieldNames) = self.encoder.decode(encoded) if len(fieldsDict) == 0: return (fieldsDict, fieldNames) # Expect only 1 field assert (len(fieldsDict) == 1) # Convert each range into normal space (inRanges, inDesc) = fieldsDict.values()[0] outRanges = [] for (minV, maxV) in inRanges: outRanges.append( (math.pow(10, minV / 10.0), math.pow(10, maxV / 10.0))) # Generate a text description of the ranges desc = "" numRanges = len(outRanges) for i in xrange(numRanges): if outRanges[i][0] != outRanges[i][1]: desc += "%.2f-%.2f" % (outRanges[i][0], outRanges[i][1]) else: desc += "%.2f" % (outRanges[i][0]) if i < numRanges - 1: desc += ", " # Return result if parentFieldName != '': fieldName = "%s.%s" % (parentFieldName, self.name) else: fieldName = self.name return ({fieldName: (outRanges, desc)}, [fieldName]) ############################################################################ def getBucketValues(self): """ See the function description in base.py """ # Need to re-create? if self._bucketValues is None: scaledValues = self.encoder.getBucketValues() self._bucketValues = [] for scaledValue in scaledValues: value = math.pow(10, scaledValue / 10.0) self._bucketValues.append(value) return self._bucketValues ############################################################################ def getBucketInfo(self, buckets): """ See the function description in base.py """ scaledResult = self.encoder.getBucketInfo(buckets)[0] scaledValue = scaledResult.value value = math.pow(10, scaledValue / 10.0) return [ EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding) ] ############################################################################ def topDownCompute(self, encoded): """ See the function description in base.py """ scaledResult = self.encoder.topDownCompute(encoded)[0] scaledValue = scaledResult.value value = math.pow(10, scaledValue / 10.0) return EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding) ############################################################################ def closenessScores(self, expValues, actValues, fractional=True): """ See the function description in base.py """ # Compute the percent error in log space if expValues[0] > 0: expValue = 10 * math.log10(expValues[0]) else: expValue = self.minScaledValue if actValues[0] > 0: actValue = 10 * math.log10(actValues[0]) else: actValue = self.minScaledValue if fractional: err = abs(expValue - actValue) pctErr = err / (self.maxScaledValue - self.minScaledValue) pctErr = min(1.0, pctErr) closeness = 1.0 - pctErr else: err = abs(expValue - actValue) closeness = err #print "log::", "expValue:", expValues[0], "actValue:", actValues[0], \ # "closeness", closeness #import pdb; pdb.set_trace() return numpy.array([closeness])