def testFieldMetaTypeIsValid(self):
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.string), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.datetime), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.integer), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.float), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.boolean), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.list), True)
  self.assertEqual(FieldMetaType.isValid(FieldMetaType.sdr), True)
  self.assertEqual(FieldMetaType.isValid("bogus-type"), False)
def __init__(self, streamDef, bookmark=None, saveOutput=False,
             isBlocking=True, maxTimeout=0, eofOnTimeout=False):
  """ Base class constructor, performs common initialization

  Parameters:
  ----------------------------------------------------------------
  streamDef:  The stream definition, potentially containing multiple sources
              (not supported yet). See
              /nupic/frameworks/opf/jsonschema/stream_def.json for the format
              of this dict.

  bookmark:   Bookmark to start reading from. This overrides the first_record
              field of the streamDef if provided.

  saveOutput: If True, save the output to a csv file in a temp directory.
              The path to the generated file can be found in the log output.

  isBlocking: Should the read operation block *forever* if the next row of
              data is not available, but the stream is not marked as
              'completed' yet?

  maxTimeout: If isBlocking is False, max seconds to wait for more data before
              timing out; ignored when isBlocking is True.

  eofOnTimeout: If True and we get a read timeout (isBlocking must be False
              to get read timeouts), assume we've reached the end of the
              input and produce the last aggregated record, if one can be
              completed.
  """
  # Call superclass constructor
  super(StreamReader, self).__init__()

  loggerPrefix = 'com.numenta.nupic.data.StreamReader'
  self._logger = logging.getLogger(loggerPrefix)
  jsonhelpers.validate(streamDef,
                       schemaPath=pkg_resources.resource_filename(
                           jsonschema.__name__, "stream_def.json"))
  assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

  # Save constructor args
  sourceDict = streamDef['streams'][0]
  self._recordCount = 0
  self._eofOnTimeout = eofOnTimeout
  self._logger.debug('Reading stream with the def: %s', sourceDict)

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None

  # ---------------------------------------------------------------------
  # Get the stream definition params

  # Limiting window of the stream. No records are returned until the record
  # with ID 'first_record' is read (or the very first record with an ID above
  # that). The stream returns EOS once it reads a record with ID 'last_record'
  # or above (NOTE: 'last_record' is NOT inclusive).
  firstRecordIdx = sourceDict.get('first_record', None)
  self._sourceLastRecordIdx = sourceDict.get('last_record', None)

  # If a bookmark was given, then override first_record from the stream
  # definition.
  if bookmark is not None:
    firstRecordIdx = None

  # Column names must be provided in the streamdef json.
  # Special case is ['*'], meaning all available names from the record stream.
  self._streamFieldNames = sourceDict.get('columns', None)
  if self._streamFieldNames is not None and self._streamFieldNames[0] == '*':
    self._needFieldsFiltering = False
  else:
    self._needFieldsFiltering = True

  # Types must be specified in the streamdef json or, in the case of the
  # file_record_stream, may be implicit from the file.
  streamFieldTypes = sourceDict.get('types', None)
  self._logger.debug('Types from the def: %s', streamFieldTypes)
  # Validate that all types are valid
  if streamFieldTypes is not None:
    for dataType in streamFieldTypes:
      assert FieldMetaType.isValid(dataType)

  # Reset, sequence and time fields might be provided by the streamdef json
  streamResetFieldName = streamDef.get('resetField', None)
  streamTimeFieldName = streamDef.get('timeField', None)
  streamSequenceFieldName = streamDef.get('sequenceIdField', None)
  self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                     streamTimeFieldName, streamSequenceFieldName)

  # =======================================================================
  # Open up the underlying record store
  dataUrl = sourceDict.get('source', None)
  assert dataUrl is not None
  self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                       bookmark, firstRecordIdx)
  assert self._recordStore is not None

  # =======================================================================
  # Prepare the data structures we need for returning just the fields
  # the caller wants from each record
  recordStoreFields = self._recordStore.getFields()
  self._recordStoreFieldNames = self._recordStore.getFieldNames()

  if not self._needFieldsFiltering:
    self._streamFieldNames = self._recordStoreFieldNames

  # Build up the field definitions for each field. This is a list of tuples
  # of (name, type, special)
  self._streamFields = []
  for dstIdx, name in enumerate(self._streamFieldNames):
    if name not in self._recordStoreFieldNames:
      raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

    fieldIdx = self._recordStoreFieldNames.index(name)
    fieldType = recordStoreFields[fieldIdx].type
    fieldSpecial = recordStoreFields[fieldIdx].special

    # If the types or specials were defined in the stream definition,
    # then override what was found in the record store
    if streamFieldTypes is not None:
      fieldType = streamFieldTypes[dstIdx]

    if streamResetFieldName is not None and streamResetFieldName == name:
      fieldSpecial = FieldMetaSpecial.reset
    if streamTimeFieldName is not None and streamTimeFieldName == name:
      fieldSpecial = FieldMetaSpecial.timestamp
    if (streamSequenceFieldName is not None and
        streamSequenceFieldName == name):
      fieldSpecial = FieldMetaSpecial.sequence

    self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))

  # ========================================================================
  # Create the aggregator which will handle aggregation of records before
  # returning them.
  self._aggregator = Aggregator(
      aggregationInfo=streamDef.get('aggregation', None),
      inputFields=recordStoreFields,
      timeFieldName=streamDef.get('timeField', None),
      sequenceIdFieldName=streamDef.get('sequenceIdField', None),
      resetFieldName=streamDef.get('resetField', None))

  # We rely on the aggregator to tell us the bookmark of the last raw input
  # that contributed to the aggregated record
  self._aggBookmark = None

  # Compute the aggregation period in terms of months and seconds
  if 'aggregation' in streamDef:
    self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
        streamDef.get('aggregation'))
  else:
    self._aggMonthsAndSeconds = None

  # ========================================================================
  # Are we saving the generated output to a csv?
  if saveOutput:
    tmpDir = tempfile.mkdtemp()
    outFilename = os.path.join(tmpDir, "generated_output.csv")
    self._logger.info("StreamReader: Saving generated records to: '%s'" %
                      outFilename)
    self._writer = FileRecordStream(streamID=outFilename,
                                    write=True,
                                    fields=self._streamFields)
  else:
    self._writer = None
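

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an assumption-laden example, not part of the
# original code): builds a minimal single-source stream definition and reads
# one record. The CSV path, 'info' strings, and key layout below are
# hypothetical; they are meant to follow stream_def.json as described in the
# constructor docstring above, and getNextRecord() is assumed from the
# record-stream interface this class implements.
def _exampleStreamReaderUsage():
  streamDef = {
    'version': 1,
    'info': 'hypothetical example stream',
    'streams': [{
      'source': 'file://./example_input.csv',  # hypothetical CSV with the
                                               # standard 3-row header
      'info': 'example source',
      'columns': ['*'],                        # take every column as-is
    }],
  }
  reader = StreamReader(streamDef, saveOutput=False, isBlocking=False,
                        maxTimeout=5)
  return reader.getNextRecord()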
def __init__(self, streamID, write=False, fields=None, missingValues=None,
             bookmark=None, includeMS=True, firstRecord=None):
  """
  streamID:      CSV file name, input or output
  write:         True or False, open for writing if True
  fields:        a list of nupic.data.fieldmeta.FieldMetaInfo field
                 descriptors, only applicable when write==True
  missingValues: what missing values should be replaced with?
  bookmark:      a reference to the previous reader; if passed in, the records
                 will be returned starting from the point where the bookmark
                 was requested. Either bookmark or firstRecord can be
                 specified, not both. If bookmark is used, then firstRecord
                 MUST be None.
  includeMS:     If False, the microseconds portion is not included in the
                 generated output file timestamp fields. This makes it
                 compatible with reading in from Excel.
  firstRecord:   0-based index of the first record to start reading from.
                 Either bookmark or firstRecord can be specified, not both.
                 If bookmark is used, then firstRecord MUST be None.

  Each field is a 3-tuple (name, type, special or FieldMetaSpecial.none).

  The name is the name of the field. The type is one of the constants in
  `FieldMetaType`. The special is one of the `FieldMetaSpecial` values that
  designate the field as the sequenceId, reset, timestamp, or category. With
  the exception of multiple categories, there can be at most one of each.
  There may be multiple fields of type datetime, but no more than one of them
  may be the timestamp field (FieldMetaSpecial.timestamp). The sequence id
  field must be either a string or an int. The reset field must be an int
  (and must contain 0 or 1).

  The category field must be an int or a space-separated list of ints, where
  the former represents single-label classification and the latter is for
  multi-label classification (e.g. "1 3 4" designates a record for labels 1,
  3, and 4). The number of categories is allowed to vary from record to
  record; sensor regions represent non-categories with -1, thus the category
  values must be >= 0.

  The FileRecordStream iterates over the field names, types and specials and
  stores the information.
  """
  super(FileRecordStream, self).__init__()

  # Only bookmark or firstRecord can be specified, not both
  if bookmark is not None and firstRecord is not None:
    raise RuntimeError(
        "Only bookmark or firstRecord can be specified, not both")

  if fields is None:
    fields = []
  if missingValues is None:
    missingValues = ['']

  # We'll be operating on csvs with arbitrarily long fields
  size = 2**27
  csv.field_size_limit(size)

  self._filename = streamID
  # We can't guarantee what system the files are coming from, so use
  # universal newlines
  self._write = write
  self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
  self._file = open(self._filename, self._mode)
  self._sequences = set()
  self.rewindAtEOF = False

  if write:
    assert fields is not None
    assert isinstance(fields, (tuple, list))
    # Verify all fields are 3-tuples
    assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
               for f in fields)
    names, types, specials = zip(*fields)
    self._writer = csv.writer(self._file)
  else:
    # Make sure readline() works on windows too
    os.linesep = '\n'
    # Read header lines
    self._reader = csv.reader(self._file, dialect="excel")
    try:
      names = [n.strip() for n in self._reader.next()]
    except:
      raise Exception('The header line of the file %s contained a NULL byte'
                      % self._filename)
    types = [t.strip() for t in self._reader.next()]
    specials = [s.strip() for s in self._reader.next()]

    # If there are no specials, this means there was a blank line
    if len(specials) == 0:
      specials = [""]

  if not len(names) == len(types) == len(specials):
    raise Exception('Invalid file format: different number of fields '
                    'in the header rows of file %s (%d, %d, %d)' %
                    (streamID, len(names), len(types), len(specials)))

  # Verify standard file format
  for t in types:
    if not FieldMetaType.isValid(t):
      raise Exception('Invalid file format for "%s" - field type "%s" '
                      'not a valid FieldMetaType' % (self._filename, t,))

  for s in specials:
    if not FieldMetaSpecial.isValid(s):
      raise Exception('Invalid file format. \'%s\' is not a valid special '
                      'flag' % s)

  self._fields = [FieldMetaInfo(*attrs)
                  for attrs in zip(names, types, specials)]
  self._fieldCount = len(self._fields)

  # Keep track of how many records have been read/written
  self._recordCount = 0

  self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                        if FieldMetaSpecial.timestamp in specials else None)
  self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                    if FieldMetaSpecial.reset in specials else None)
  self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                         if FieldMetaSpecial.sequence in specials else None)
  self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                       if FieldMetaSpecial.category in specials else None)
  self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                       if FieldMetaSpecial.learning in specials else None)

  # Keep track of the current sequence
  self._currSequence = None
  self._currTime = None

  if self._timeStampIdx:
    assert types[self._timeStampIdx] == FieldMetaType.datetime
  if self._sequenceIdIdx:
    assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                          FieldMetaType.integer)
  if self._resetIdx:
    assert types[self._resetIdx] == FieldMetaType.integer
  if self._categoryIdx:
    assert types[self._categoryIdx] in (FieldMetaType.list,
                                        FieldMetaType.integer)
  if self._learningIdx:
    assert types[self._learningIdx] == FieldMetaType.integer

  # Map each field type to the function used to parse (read mode) or
  # serialize (write mode) its string representation
  if self._mode == self._FILE_READ_MODE:
    m = {FieldMetaType.integer: intOrNone,
         FieldMetaType.float: floatOrNone,
         FieldMetaType.boolean: parseBool,
         FieldMetaType.string: unescape,
         FieldMetaType.datetime: parseTimestamp,
         FieldMetaType.sdr: parseSdr,
         FieldMetaType.list: parseStringList}
  else:
    if includeMS:
      datetimeFunc = serializeTimestamp
    else:
      datetimeFunc = serializeTimestampNoMS
    m = {FieldMetaType.integer: str,
         FieldMetaType.float: str,
         FieldMetaType.string: escape,
         FieldMetaType.boolean: str,
         FieldMetaType.datetime: datetimeFunc,
         FieldMetaType.sdr: serializeSdr,
         FieldMetaType.list: stripList}

  self._adapters = [m[t] for t in types]

  self._missingValues = missingValues

  # If a bookmark or firstRecord was given, we need to skip over the first
  # N records
  if bookmark is not None:
    rowsToSkip = self._getStartRow(bookmark)
  elif firstRecord is not None:
    rowsToSkip = firstRecord
  else:
    rowsToSkip = 0

  while rowsToSkip > 0:
    self.next()
    rowsToSkip -= 1

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None
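

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an assumption-laden example, not part of the
# original code): writes a CSV with the three header rows (names, types,
# specials) that this constructor parses, then reads the record back. The
# file name and field layout are hypothetical; appendRecord(), getNextRecord()
# and close() are assumed from the record-stream interface this class
# implements, and FieldMetaInfo / FieldMetaType / FieldMetaSpecial are the
# module's existing imports.
def _exampleFileRecordStreamRoundTrip():
  import datetime  # local import so the sketch stays self-contained

  fields = [
    FieldMetaInfo('timestamp', FieldMetaType.datetime,
                  FieldMetaSpecial.timestamp),
    FieldMetaInfo('consumption', FieldMetaType.float, FieldMetaSpecial.none),
  ]

  # Writing emits the 3-row header (names, types, specials) ahead of the
  # first record
  writer = FileRecordStream('example_roundtrip.csv', write=True, fields=fields)
  writer.appendRecord([datetime.datetime(2015, 1, 1, 0, 0, 0), 3.14])
  writer.close()

  # Reading parses the header back into FieldMetaInfo entries and converts
  # each value with the per-type adapters set up in the constructor
  reader = FileRecordStream('example_roundtrip.csv')
  record = reader.getNextRecord()  # e.g. [datetime(2015, 1, 1, 0, 0), 3.14]
  reader.close()
  return record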