def testFieldMetaTypeIsValid(self):
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.string), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.datetime), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.integer), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.float), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.boolean), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.list), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.sdr), True)
  self.assertEqual(FieldMetaType.isValid("bogus-type"), False)
def __init__(self, streamID, write=False, fields=None, missingValues=None,
             bookmark=None, includeMS=True, firstRecord=None):
  super(FileRecordStream, self).__init__()

  # Only bookmark or firstRecord can be specified, not both
  if bookmark is not None and firstRecord is not None:
    raise RuntimeError(
        "Only bookmark or firstRecord can be specified, not both")

  if fields is None:
    fields = []
  if missingValues is None:
    missingValues = ['']

  # We'll be operating on csvs with arbitrarily long fields
  size = 2**27
  csv.field_size_limit(size)

  self._filename = streamID
  # We can't guarantee what system files are coming from, use universal
  # newlines
  self._write = write
  self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
  self._file = open(self._filename, self._mode)
  self._sequences = set()
  self.rewindAtEOF = False

  if write:
    assert fields is not None
    assert isinstance(fields, (tuple, list))
    # Verify all fields are 3-tuples
    assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
               for f in fields)
    names, types, specials = zip(*fields)
    self._writer = csv.writer(self._file)
  else:
    # Read header lines
    self._reader = csv.reader(self._file, dialect="excel")
    try:
      names = [n.strip() for n in self._reader.next()]
    except:
      raise Exception('The header line of the file %s contained a NULL byte'
                      % self._filename)
    types = [t.strip() for t in self._reader.next()]
    specials = [s.strip() for s in self._reader.next()]

    # If there are no specials, this means there was a blank line
    if len(specials) == 0:
      specials = [""]

    if not len(names) == len(types) == len(specials):
      raise Exception('Invalid file format: different number of fields '
                      'in the header rows of file %s (%d, %d, %d)' %
                      (streamID, len(names), len(types), len(specials)))

    # Verify standard file format
    for t in types:
      if not FieldMetaType.isValid(t):
        raise Exception('Invalid file format for "%s" - field type "%s" '
                        'not a valid FieldMetaType' % (self._filename, t))

    for s in specials:
      if not FieldMetaSpecial.isValid(s):
        raise Exception('Invalid file format. \'%s\' is not a valid special '
                        'flag' % s)

  self._fields = [FieldMetaInfo(*attrs)
                  for attrs in zip(names, types, specials)]
  self._fieldCount = len(self._fields)

  # Keep track of how many records have been read/written
  self._recordCount = 0

  self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                        if FieldMetaSpecial.timestamp in specials else None)
  self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                    if FieldMetaSpecial.reset in specials else None)
  self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                         if FieldMetaSpecial.sequence in specials else None)
  self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                       if FieldMetaSpecial.category in specials else None)
  self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                       if FieldMetaSpecial.learning in specials else None)

  # Keep track of the current sequence
  self._currSequence = None
  self._currTime = None

  if self._timeStampIdx:
    assert types[self._timeStampIdx] == FieldMetaType.datetime
  if self._sequenceIdIdx:
    assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                          FieldMetaType.integer)
  if self._resetIdx:
    assert types[self._resetIdx] == FieldMetaType.integer
  if self._categoryIdx:
    assert types[self._categoryIdx] in (FieldMetaType.list,
                                        FieldMetaType.integer)
  if self._learningIdx:
    assert types[self._learningIdx] == FieldMetaType.integer

  # Convert the types to the actual types in order to convert the strings
  if self._mode == self._FILE_READ_MODE:
    m = {FieldMetaType.integer: intOrNone,
         FieldMetaType.float: floatOrNone,
         FieldMetaType.boolean: parseBool,
         FieldMetaType.string: unescape,
         FieldMetaType.datetime: parseTimestamp,
         FieldMetaType.sdr: parseSdr,
         FieldMetaType.list: parseStringList}
  else:
    if includeMS:
      datetimeFunc = serializeTimestamp
    else:
      datetimeFunc = serializeTimestampNoMS
    m = {FieldMetaType.integer: str,
         FieldMetaType.float: str,
         FieldMetaType.string: escape,
         FieldMetaType.boolean: str,
         FieldMetaType.datetime: datetimeFunc,
         FieldMetaType.sdr: serializeSdr,
         FieldMetaType.list: stripList}

  self._adapters = [m[t] for t in types]

  self._missingValues = missingValues

  #
  # If the bookmark is set, we need to skip over the first N records
  #
  if bookmark is not None:
    rowsToSkip = self._getStartRow(bookmark)
  elif firstRecord is not None:
    rowsToSkip = firstRecord
  else:
    rowsToSkip = 0

  while rowsToSkip > 0:
    self.next()
    rowsToSkip -= 1

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None
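# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the class above): writing a small
# file and reading it back. The path and field names are hypothetical, and
# the import paths assume the usual NuPIC layout (nupic.data.field_meta and
# nupic.data.file_record_stream); adjust to your tree if it differs.

from datetime import datetime

from nupic.data.field_meta import FieldMetaInfo, FieldMetaSpecial, FieldMetaType
from nupic.data.file_record_stream import FileRecordStream


def _fileRecordStreamExample(path='example.csv'):
  # Each field is a (name, type, special) triple; in write mode these become
  # the three CSV header rows that the read-mode constructor parses back out.
  fields = [
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('consumption', FieldMetaType.float, FieldMetaSpecial.none),
  ]

  writer = FileRecordStream(streamID=path, write=True, fields=fields)
  writer.appendRecord([datetime(2015, 1, 1, 0, 0), 3.14])
  writer.close()

  reader = FileRecordStream(streamID=path)
  print reader.getFields()       # the same (name, type, special) triples
  print reader.getNextRecord()   # [datetime(2015, 1, 1, 0, 0), 3.14]
  reader.close()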
def __init__(self, streamDef, bookmark=None, saveOutput=False,
             isBlocking=True, maxTimeout=0, eofOnTimeout=False):
  # Call superclass constructor
  super(StreamReader, self).__init__()

  loggerPrefix = 'com.numenta.nupic.data.StreamReader'
  self._logger = logging.getLogger(loggerPrefix)
  json_helpers.validate(streamDef,
                        schemaPath=pkg_resources.resource_filename(
                            jsonschema.__name__, "stream_def.json"))
  assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

  # Save constructor args
  sourceDict = streamDef['streams'][0]
  self._recordCount = 0
  self._eofOnTimeout = eofOnTimeout
  self._logger.debug('Reading stream with the def: %s', sourceDict)

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None

  # ---------------------------------------------------------------------
  # Get the stream definition params

  # Limiting window of the stream. It would not return any records until
  # the 'first_record' ID is read (or the very first with an ID above that).
  # The stream will return EOS once it reads a record with ID 'last_record'
  # or above (NOTE: the name 'last_record' is misleading because it is NOT
  # inclusive).
  firstRecordIdx = sourceDict.get('first_record', None)
  self._sourceLastRecordIdx = sourceDict.get('last_record', None)

  # If a bookmark was given, then override first_record from the stream
  # definition.
  if bookmark is not None:
    firstRecordIdx = None

  # Column names must be provided in the streamdef json.
  # Special case is ['*'], meaning all available names from the record stream
  self._streamFieldNames = sourceDict.get('columns', None)
  if self._streamFieldNames is not None and self._streamFieldNames[0] == '*':
    self._needFieldsFiltering = False
  else:
    self._needFieldsFiltering = True

  # Types must be specified in the streamdef json, or in the case of
  # file_record_stream the types could be implicit from the file
  streamFieldTypes = sourceDict.get('types', None)
  self._logger.debug('Types from the def: %s', streamFieldTypes)
  # Validate that all types are valid
  if streamFieldTypes is not None:
    for dataType in streamFieldTypes:
      assert FieldMetaType.isValid(dataType)

  # Reset, sequence and time fields might be provided by the streamdef json
  streamResetFieldName = streamDef.get('resetField', None)
  streamTimeFieldName = streamDef.get('timeField', None)
  streamSequenceFieldName = streamDef.get('sequenceIdField', None)
  self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                     streamTimeFieldName, streamSequenceFieldName)

  # =======================================================================
  # Open up the underlying record store
  dataUrl = sourceDict.get('source', None)
  assert dataUrl is not None
  self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                       bookmark, firstRecordIdx)
  assert self._recordStore is not None

  # =======================================================================
  # Prepare the data structures we need for returning just the fields
  # the caller wants from each record
  recordStoreFields = self._recordStore.getFields()
  self._recordStoreFieldNames = self._recordStore.getFieldNames()

  if not self._needFieldsFiltering:
    self._streamFieldNames = self._recordStoreFieldNames

  # Build up the field definitions for each field. This is a list of tuples
  # of (name, type, special)
  self._streamFields = []
  for dstIdx, name in enumerate(self._streamFieldNames):
    if name not in self._recordStoreFieldNames:
      raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

    fieldIdx = self._recordStoreFieldNames.index(name)
    fieldType = recordStoreFields[fieldIdx].type
    fieldSpecial = recordStoreFields[fieldIdx].special

    # If the types or specials were defined in the stream definition,
    # then override what was found in the record store
    if streamFieldTypes is not None:
      fieldType = streamFieldTypes[dstIdx]

    if streamResetFieldName is not None and streamResetFieldName == name:
      fieldSpecial = FieldMetaSpecial.reset
    if streamTimeFieldName is not None and streamTimeFieldName == name:
      fieldSpecial = FieldMetaSpecial.timestamp
    if (streamSequenceFieldName is not None and
        streamSequenceFieldName == name):
      fieldSpecial = FieldMetaSpecial.sequence

    self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))

  # ========================================================================
  # Create the aggregator which will handle aggregation of records before
  # returning them.
  self._aggregator = Aggregator(
      aggregationInfo=streamDef.get('aggregation', None),
      inputFields=recordStoreFields,
      timeFieldName=streamDef.get('timeField', None),
      sequenceIdFieldName=streamDef.get('sequenceIdField', None),
      resetFieldName=streamDef.get('resetField', None))

  # We rely on the aggregator to tell us the bookmark of the last raw input
  # that contributed to the aggregated record
  self._aggBookmark = None

  # Compute the aggregation period in terms of months and seconds
  if 'aggregation' in streamDef:
    self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
        streamDef.get('aggregation'))
  else:
    self._aggMonthsAndSeconds = None

  # ========================================================================
  # Are we saving the generated output to a csv?
  if saveOutput:
    tmpDir = tempfile.mkdtemp()
    outFilename = os.path.join(tmpDir, "generated_output.csv")
    self._logger.info("StreamReader: Saving generated records to: '%s'",
                      outFilename)
    self._writer = FileRecordStream(streamID=outFilename,
                                    write=True,
                                    fields=self._streamFields)
  else:
    self._writer = None
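# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): a minimal stream
# definition of the shape this constructor expects -- a single entry under
# 'streams' with a 'source' URL and a 'columns' list, plus the optional
# top-level 'timeField' / 'aggregation' keys read above. The source path,
# field names and aggregation settings are hypothetical; the exact required
# keys are governed by stream_def.json.

exampleStreamDef = {
    'version': 1,
    'info': 'hourly energy consumption',
    'streams': [
        {
            'source': 'file://path/to/data.csv',
            'info': 'input csv',
            'columns': ['*'],    # '*' = take every column from the source
            # 'first_record' / 'last_record' would bound the window here
        },
    ],
    'timeField': 'timestamp',
    'aggregation': {
        'hours': 1,
        'fields': [('consumption', 'sum')],
    },
}

# reader = StreamReader(exampleStreamDef, saveOutput=False)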