def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if not (StateKey.POSITION in state_obj and
            StateKey.METADATA_EXTRACTED in state_obj):
        raise DatasetParserException(
            "Provided state is missing position or metadata extracted")

    self._state = state_obj
    self._read_state = state_obj

    # Clear the record buffer
    self._record_buffer = []

    # Need to seek to the correct position in the file stream using the read state position.
    self._stream_handle.seek(self._read_state[StateKey.POSITION])

    # make sure we have cleaned the chunker out of old data
    self._chunker.clean_all_chunks()
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if CtdmoStateKey.POSITION not in state_obj:
        raise DatasetParserException('%s missing in state keys' %
                                     CtdmoStateKey.POSITION)

    if CtdmoStateKey.END_CONFIG not in state_obj:
        raise DatasetParserException('%s missing in state keys' %
                                     CtdmoStateKey.END_CONFIG)

    if CtdmoStateKey.SERIAL_NUMBER not in state_obj:
        raise DatasetParserException('%s missing in state keys' %
                                     CtdmoStateKey.SERIAL_NUMBER)

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self.input_file.seek(state_obj[CtdmoStateKey.POSITION])
def set_state(self, state_obj):
    """
    This method will set the state of the MmpCdsParser to a given state
    @param state_obj the updated state to use
    """
    log.debug("Attempting to set state to: %s", state_obj)

    # First need to make sure the state type is a dict
    if not isinstance(state_obj, dict):
        log.warn("Invalid state structure")
        raise DatasetParserException("Invalid state structure")

    # Then we need to make sure that the provided state includes particles returned information
    if StateKey.PARTICLES_RETURNED not in state_obj:
        log.debug(PARTICLES_RETURNED_MISSING_ERROR_MSG)
        raise DatasetParserException(PARTICLES_RETURNED_MISSING_ERROR_MSG)

    # Clear out any pre-existing chunks
    self._chunker.clean_all_chunks()

    self._record_buffer = []

    # Set the state to the provided state
    self._state = state_obj

    # Always seek to the beginning of the buffer to read all records
    self._stream_handle.seek(0)
def set_state(self, state_obj):
    """
    initialize the state
    """
    log.trace("Attempting to set state to: %s", state_obj)

    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if StateKey.POSITION not in state_obj:
        raise DatasetParserException("Invalid state keys")

    self._chunker.clean_all_chunks()
    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self._stream_handle.seek(state_obj[StateKey.POSITION])
def assert_particles(self, particles, yml_file, resource_path=None):
    """
    Assert that the contents of the particles match those in the results
    yaml file.
    @param particles either a DataParticle sub-class or particle dictionary
        to compare with the particles in the .yml file
    @param yml_file the .yml file name or full path containing particles
        to compare
    @param resource_path the path to the .yml file, used only if yml_file
        does not contain the full path
    """
    # see if .yml file has the full path
    if os.path.exists(yml_file):
        rs_file = yml_file
    # if not the full path, check if resource path was defined
    elif resource_path is not None:
        rs_file = os.path.join(resource_path, yml_file)
    # out of places to check for the file, raise an error
    else:
        raise DatasetParserException(
            'Test yaml file cannot be found to assert particles')

    # initialize result set with this .yml results file
    rs = ResultSet(rs_file)

    # compare results particles and assert that the output was successful
    self.assertTrue(rs.verify(particles),
                    msg=('Failed unit test data validation for file %s' % yml_file))
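# A minimal usage sketch (illustrative, not taken from the source): inside a
# parser unit test, assert_particles would be called with particles returned
# from the parser under test and a known-good .yml results file. The file name
# and RESOURCE_PATH below are hypothetical placeholders.
#
#     particles = parser.get_records(20)
#     self.assert_particles(particles, 'example_results.yml',
#                           resource_path=RESOURCE_PATH)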
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if StateKey.POSITION not in state_obj:
        raise DatasetParserException("Invalid state keys")

    self._chunker.clean_all_chunks()
    self._record_buffer = []
    self._saved_header = None
    self._state = state_obj
    self._read_state = state_obj

    self._stream_handle.seek(state_obj[StateKey.POSITION])
def __init__(self, config, stream_handle, state, state_callback,
             publish_callback, exception_callback, *args, **kwargs):

    #
    # Verify that the required parameters are in the parser configuration.
    #
    if CtdmoStateKey.INDUCTIVE_ID not in config:
        raise DatasetParserException("Parser config is missing %s" %
                                     CtdmoStateKey.INDUCTIVE_ID)

    #
    # No fancy sieve function needed for this parser.
    # File is ASCII with records separated by newlines.
    #
    super(CtdmoRecoveredCtParser, self).__init__(
        config,
        stream_handle,
        state,
        partial(StringChunker.regex_sieve_function,
                regex_list=[REC_CT_RECORD_MATCHER]),
        state_callback,
        publish_callback,
        exception_callback,
        *args,
        **kwargs)

    #
    # Default the position within the file to the beginning
    # and set flags to indicate the end of Configuration has not been reached
    # and the serial number has not been found.
    #
    self._read_state = {
        CtdmoStateKey.POSITION: 0,
        CtdmoStateKey.END_CONFIG: False,
        CtdmoStateKey.SERIAL_NUMBER: None
    }

    self.input_file = stream_handle

    if state is not None:
        self.set_state(state)
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to. Should be a dict with
        StateKey.UNPROCESSED_DATA, StateKey.IN_PROCESS_DATA, and
        StateKey.FILE_SIZE values. The UNPROCESSED_DATA and IN_PROCESS_DATA
        values are both arrays which contain arrays of start and end indices
        for their respective types of data.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException(
            "Invalid state structure - not a dictionary")

    # Verify that all required state keys are present.
    if not ((StateKey.UNPROCESSED_DATA in state_obj) and
            (StateKey.IN_PROCESS_DATA in state_obj) and
            (StateKey.FILE_SIZE in state_obj)):
        raise DatasetParserException(
            "State key %s, %s or %s missing" %
            (StateKey.UNPROCESSED_DATA,
             StateKey.IN_PROCESS_DATA,
             StateKey.FILE_SIZE))

    # store both the start and end point for this read of data within the file
    if state_obj[StateKey.UNPROCESSED_DATA] is None:
        self._position = [0, 0]
    else:
        self._position = [
            state_obj[StateKey.UNPROCESSED_DATA][0][START_IDX],
            state_obj[StateKey.UNPROCESSED_DATA][0][START_IDX]
        ]

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    # it is possible to be in the middle of processing a packet. Since we have to
    # process a whole packet, which may contain multiple samples, we have to
    # re-read the entire packet, then throw out the already received samples
    self._samples_to_throw_out = None
    self._mid_sample_packets = len(state_obj[StateKey.IN_PROCESS_DATA])
    if self._mid_sample_packets > 0 and \
            state_obj[StateKey.IN_PROCESS_DATA][0][SAMPLES_RETURNED] > 0:
        self._samples_to_throw_out = \
            state_obj[StateKey.IN_PROCESS_DATA][0][SAMPLES_RETURNED]

    # make sure we have cleaned the chunker out of old data so there are no wrap arounds
    self._chunker.clean_all_chunks()
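# A hedged sketch of the state shape this method expects (the layout is
# inferred from the indexing above, and every value here is made up):
#
#     state = {
#         StateKey.UNPROCESSED_DATA: [[0, 150], [300, 450]],  # [start, end] byte ranges
#         StateKey.IN_PROCESS_DATA: [[300, 450, 2, 1]],       # SAMPLES_RETURNED is the last index
#         StateKey.FILE_SIZE: 9000,
#     }
#     parser.set_state(state)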
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to. Should be a dict with a
        StateKey.UNPROCESSED_DATA value, a StateKey.IN_PROCESS_DATA value, and
        a StateKey.TIMESTAMP value. The UNPROCESSED_DATA and IN_PROCESS_DATA
        values are both arrays which contain arrays of start and end indices
        for their respective types of data. The timestamp is an NTP4 format
        timestamp.
    @throws DatasetParserException if there is a bad state structure
    """
    log.debug("Setting state to: %s", state_obj)

    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if not ((StateKey.UNPROCESSED_DATA in state_obj) and
            (StateKey.IN_PROCESS_DATA in state_obj) and
            (StateKey.TIMESTAMP in state_obj)):
        raise DatasetParserException("Invalid state keys")

    self._timestamp = state_obj[StateKey.TIMESTAMP]

    # store both the start and end point for this read of data within the file
    self._position = [
        state_obj[StateKey.UNPROCESSED_DATA][0][0],
        state_obj[StateKey.UNPROCESSED_DATA][0][0]
    ]

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    # it is possible to be in the middle of processing a packet. Since we have to
    # process a whole packet, which may contain multiple samples, we have to
    # re-read the entire packet, then throw out the already received samples
    self._samples_to_throw_out = None
    self._mid_sample_packets = len(state_obj[StateKey.IN_PROCESS_DATA])
    if self._mid_sample_packets > 0 and \
            state_obj[StateKey.IN_PROCESS_DATA][0][3] > 0:
        self._samples_to_throw_out = state_obj[StateKey.IN_PROCESS_DATA][0][3]

    # make sure we have cleaned the chunker out of old data so there are no wrap arounds
    self._clean_all_chunker()

    self._new_seq_flag = True  # state has changed, start a new sequence

    # seek to the first unprocessed position
    self._stream_handle.seek(state_obj[StateKey.UNPROCESSED_DATA][0][0])
    log.debug('Seeking to %d', state_obj[StateKey.UNPROCESSED_DATA][0][0])
def _read_column_labels(self):
    """
    Read the next three lines to populate column data.
    1st Row (row 15 of file) == labels
    2nd Row (row 16 of file) == units
    3rd Row (row 17 of file) == column byte size
    Currently we are only able to support 3 label line rows.
    """
    # read the label line (should be at row 15 of the file at this point)
    label_list = self._stream_handle.readline().strip().split()
    self.num_columns = len(label_list)
    self._header_dict['labels'] = label_list

    # the m_present_time label is required to generate particles, raise an exception if it is not found
    if GliderParticleKey.M_PRESENT_TIME not in label_list:
        raise DatasetParserException(
            'The m_present_time label has not been found, which means the timestamp '
            'cannot be determined for any particles')

    # read the units line (should be at row 16 of the file at this point)
    data_unit_list = self._stream_handle.readline().strip().split()
    data_unit_list_length = len(data_unit_list)

    # read the number of bytes line (should be at row 17 of the file at this point)
    num_of_bytes_list = self._stream_handle.readline().strip().split()
    num_of_bytes_list_length = len(num_of_bytes_list)

    # the number of labels for name, unit, and number of bytes must match
    if data_unit_list_length != self.num_columns or self.num_columns != num_of_bytes_list_length:
        raise DatasetParserException(
            "The number of columns in the labels row: %d, units row: %d, "
            "and number of bytes row: %d are not equal." %
            (self.num_columns, data_unit_list_length, num_of_bytes_list_length))

    # if the number of columns from the header does not match that in the data, but the rest of the
    # file has the same number of columns in each line, this is not a fatal error; just parse the
    # columns that are present
    if self._header_dict['sensors_per_cycle'] != self.num_columns:
        msg = 'sensors_per_cycle from header %d does not match the number of data label columns %d' % \
              (self._header_dict['sensors_per_cycle'], self.num_columns)
        self._exception_callback(SampleException(msg))

    log.debug("Label count: %d", self.num_columns)
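# Illustrative fragment of the three label rows this method expects (the column
# names and values are made up; only the m_present_time column is required by
# the check above):
#
#     m_present_time  m_depth  m_gps_lat
#     timestamp       m        lat
#     8               4        8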
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if Vel3dKWfpStateKey.POSITION not in state_obj or \
            Vel3dKWfpStateKey.RECORD_NUMBER not in state_obj:
        raise DatasetParserException("Invalid state keys")

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self.input_file.seek(state_obj[Vel3dKWfpStateKey.POSITION])
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to. Should be a dict with a
        StateKey.POSITION value. The position is the number of bytes into the file.
    @throws DatasetParserException if there is a bad state structure
    """
    log.trace("Attempting to set state to: %s", state_obj)

    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if StateKey.POSITION not in state_obj:
        raise DatasetParserException("Invalid state keys")

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    # seek to the saved position
    self._stream_handle.seek(state_obj[StateKey.POSITION])
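# A minimal sketch of how a caller might restore this parser (an assumption,
# not from the source): the saved state is a dict keyed by StateKey.POSITION
# holding the byte offset at which parsing should resume.
#
#     saved_state = {StateKey.POSITION: 1024}  # 1024 is an arbitrary example offset
#     parser.set_state(saved_state)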
def __init__(self, config, stream_handle, state, state_callback,
             publish_callback, exception_callback, *args, **kwargs):

    super(CtdmoTelemeteredParser, self).__init__(config,
                                                 stream_handle,
                                                 state,
                                                 self.sieve_function,
                                                 state_callback,
                                                 publish_callback,
                                                 exception_callback,
                                                 *args, **kwargs)

    if CtdmoStateKey.INDUCTIVE_ID not in config:
        raise DatasetParserException("Parser config is missing %s" %
                                     CtdmoStateKey.INDUCTIVE_ID)
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if StateKey.POSITION not in state_obj:
        raise DatasetParserException("Missing state key %s" % StateKey.POSITION)

    if StateKey.START_OF_DATA not in state_obj:
        raise DatasetParserException("Missing state key %s" % StateKey.START_OF_DATA)

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self._chunker.clean_all_chunks()

    # seek to the position
    self._stream_handle.seek(state_obj[StateKey.POSITION])
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    log.debug("Attempting to set state to: %s", state_obj)

    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if not ((StateKey.POSITION in state_obj) and
            (StateKey.TIMESTAMP in state_obj)):
        raise DatasetParserException("Invalid state keys")

    self._timestamp = state_obj[StateKey.TIMESTAMP]
    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    # make sure the chunker is clean of old data
    self._clean_all_chunker()

    # seek to the position
    self._stream_handle.seek(state_obj[StateKey.POSITION])
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        error_message = 'Invalid state structure'
        log.warn(error_message)
        raise DatasetParserException(error_message)

    for key in OptaaStateKey.list():
        if key not in state_obj:
            error_message = '%s missing in state keys' % key
            log.warn(error_message)
            raise DatasetParserException(error_message)

    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self.input_file.seek(state_obj[OptaaStateKey.POSITION])
def __init__(self, config, stream_handle, state, sieve_fn, state_callback,
             publish_callback, instrument_id):
    """
    @param config The configuration parameters to feed into the parser
    @param stream_handle An already open file-like filehandle
    @param state The location in the file to start parsing from. This
        reflects what has already been published.
    @param sieve_fn A sieve function that might be added to a handler to
        appropriately filter out the data
    @param state_callback The callback method from the agent driver
        (ultimately the agent) to call back when a state needs to be updated
    @param publish_callback The callback from the agent driver (and
        ultimately from the agent) where we send our sample particle to be
        published into ION
    @param instrument_id the text string indicating the instrument to
        monitor, can be 'CT', 'AD', 'FL', 'DO', or 'PH'
    """
    super(MflmParser, self).__init__(config,
                                     stream_handle,
                                     state,
                                     self.sieve_function,
                                     state_callback,
                                     publish_callback)

    if instrument_id not in ['CT', 'AD', 'FL', 'DO', 'PH']:
        raise DatasetParserException('instrument id %s is not recognized' %
                                     instrument_id)
    self._instrument_id = instrument_id
    self._timestamp = 0.0
    # store both the start and end point for this read of data within the file
    self._position = [0, 0]
    self._record_buffer = []  # holds list of records

    # determine the EOF index
    self._stream_handle.seek(0)
    all_data = self._stream_handle.read()
    eof = len(all_data)
    self._stream_handle.seek(0)

    self._new_seq_flag = True  # always start a new sequence on init
    self._chunk_sample_count = []
    self._chunk_new_seq = []
    self._samples_to_throw_out = None
    self._mid_sample_packets = 0
    self._read_state = {
        StateKey.TIMESTAMP: 0.0,
        StateKey.UNPROCESSED_DATA: [[0, eof]],
        StateKey.IN_PROCESS_DATA: []
    }
    log.debug('Starting parser')

    if state:
        self.set_state(self._state)
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to. Should be a dict with a
        StateKey.POSITION value and a StateKey.TIMESTAMP value. The position
        is the number of bytes into the file, the timestamp is an NTP4 format
        timestamp.
    @throws DatasetParserException if there is a bad state structure
    """
    log.trace("Attempting to set state to: %s", state_obj)

    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if StateKey.POSITION not in state_obj:
        raise DatasetParserException("Invalid state keys")

    self._chunker.buffer = ""
    self._chunker.raw_chunk_list = []
    self._chunker.data_chunk_list = []
    self._chunker.nondata_chunk_list = []
    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self._stream_handle.seek(state_obj[StateKey.POSITION])
def set_state(self, state_obj):
    """
    Set the value of the state object for this parser
    @param state_obj The object to set the state to.
    @throws DatasetParserException if there is a bad state structure
    """
    if not isinstance(state_obj, dict):
        raise DatasetParserException("Invalid state structure")

    if Vel3dKWfpStcStateKey.FIRST_RECORD not in state_obj or \
            Vel3dKWfpStcStateKey.POSITION not in state_obj or \
            Vel3dKWfpStcStateKey.VELOCITY_END not in state_obj:
        raise DatasetParserException("Invalid state keys")

    #
    # Initialize parent data.
    #
    self._timestamp = 0.0
    self._record_buffer = []
    self._state = state_obj
    self._read_state = state_obj

    self.input_file.seek(self._read_state[Vel3dKWfpStcStateKey.POSITION], 0)
def __init__(self, config, stream_handle, state, state_callback,
             publish_callback, exception_callback, *args, **kwargs):

    super(CtdmoRecoveredCoParser, self).__init__(config,
                                                 stream_handle,
                                                 state,
                                                 self.sieve_function,
                                                 state_callback,
                                                 publish_callback,
                                                 exception_callback,
                                                 *args, **kwargs)

    #
    # Verify that the required parameters are in the parser configuration.
    #
    if CtdmoStateKey.INDUCTIVE_ID not in config:
        raise DatasetParserException("Parser config is missing %s" %
                                     CtdmoStateKey.INDUCTIVE_ID)
def _read_data(self, data_record):
    """
    Read in the column labels, data type, number of bytes of each data type,
    and the data from an ASCII glider data file.
    """
    log.debug("_read_data: Data Record: %s", data_record)

    data_dict = {}
    num_columns = self._header_dict['sensors_per_cycle']
    data_labels = self._header_dict['labels']
    #data_units = self._header_dict['data_units']
    num_bytes = self._header_dict['num_of_bytes']
    data = data_record.strip().split()
    log.trace("Split data: %s", data)

    if num_columns != len(data):
        raise DatasetParserException(
            'Glider data file does not have the '
            'same number of columns as described '
            'in the header.\n'
            'Described: %d, Actual: %d' % (num_columns, len(data)))

    # extract record to dictionary
    for ii in range(num_columns):
        log.trace("_read_data: index: %d label: %s, value: %s",
                  ii, data_labels[ii], data[ii])

        if (num_bytes[ii] == 1) or (num_bytes[ii] == 2):
            str2data = int
        elif (num_bytes[ii] == 4) or (num_bytes[ii] == 8):
            str2data = float

        # check to see if this is a latitude/longitude string
        if ('_lat' in data_labels[ii]) or ('_lon' in data_labels[ii]):
            # convert latitude/longitude strings to decimal degrees
            value = self._string_to_ddegrees(data[ii])
        else:
            value = str2data(data[ii])

        data_dict[data_labels[ii]] = {
            'Name': data_labels[ii],
            #'Units': data_units[ii],
            #'Number_of_Bytes': int(num_bytes[ii]),
            'Data': value
        }

    log.trace("Data dict parsed: %s", data_dict)
    return data_dict
def __init__(self, config, stream_handle, exception_callback, filename, is_telemetered):

    super(OptaaDjDclParser, self).__init__(config, stream_handle, exception_callback)

    if is_telemetered:
        self.instrument_particle_class = OptaaDjDclTelemeteredInstrumentDataParticle
        self.metadata_particle_class = OptaaDjDclTelemeteredMetadataDataParticle
    else:
        self.instrument_particle_class = OptaaDjDclRecoveredInstrumentDataParticle
        self.metadata_particle_class = OptaaDjDclRecoveredMetadataDataParticle

    # Extract the start date and time from the filename and convert
    # it to the format expected for the output particle.
    # Calculate the ntp_time timestamp, the number of seconds since Jan 1, 1900,
    # based on the date and time from the filename.
    # This is the start time. Timestamps for each particle are derived from
    # the start time.

    filename_match = FILENAME_MATCHER.search(filename)
    if filename_match is not None:
        self.start_date = \
            filename_match.group(GROUP_YEAR) + '-' + \
            filename_match.group(GROUP_MONTH) + '-' + \
            filename_match.group(GROUP_DAY) + ' ' + \
            filename_match.group(GROUP_HOUR) + ':' + \
            filename_match.group(GROUP_MINUTE) + ':' + \
            filename_match.group(GROUP_SECOND)
        timestamp = (
            int(filename_match.group(GROUP_YEAR)),
            int(filename_match.group(GROUP_MONTH)),
            int(filename_match.group(GROUP_DAY)),
            int(filename_match.group(GROUP_HOUR)),
            int(filename_match.group(GROUP_MINUTE)),
            int(filename_match.group(GROUP_SECOND)),
            0, 0, 0)

        # The timestamp for each particle is:
        #     timestamp = start_time_from_file_name + (tn - t0)
        # where t0 is the time since power-up in the first record.
        elapsed_seconds = calendar.timegm(timestamp)
        self.ntp_time = ntplib.system_to_ntp_time(elapsed_seconds)
    else:
        error_message = 'Invalid filename %s' % filename
        log.warn(error_message)
        raise DatasetParserException(error_message)
def parse(basePythonCodePath, sourceFilePath, particleDataHdlrObj,
          serialNumToInductiveIdMapHandler):
    """
    This is the method called by Uframe
    :param basePythonCodePath This is the file system location of mi-dataset
    :param sourceFilePath This is the full path and filename of the file to be parsed
    :param particleDataHdlrObj Java Object to consume the output of the parser
    :param serialNumToInductiveIdMapHandler Object that maps a serial number to its inductive ID
    :return particleDataHdlrObj
    """
    log = get_logger()

    with open(sourceFilePath, 'r') as stream_handle:

        def exception_callback(exception):
            log.debug("Exception: %s", exception)
            particleDataHdlrObj.setParticleDataCaptureFailure()

        # extract the serial number from the file name
        serial_num = get_serial_num_from_filepath(sourceFilePath)

        # retrieve the inductive ID associated with the serial number
        induct_id = serialNumToInductiveIdMapHandler.getInductiveId(serial_num)

        if not induct_id:
            raise DatasetParserException(
                "Unable to obtain the inductive ID associated with serial num %d"
                % serial_num)

        parser_config = {
            DataSetDriverConfigKeys.PARTICLE_MODULE:
                'mi.dataset.parser.ctdmo_ghqr_sio',
            DataSetDriverConfigKeys.PARTICLE_CLASS:
                ['CtdmoGhqrRecoveredInstrumentDataParticle'],
            INDUCTIVE_ID_KEY: induct_id
        }

        parser = CtdmoGhqrRecoveredCtParser(parser_config, stream_handle,
                                            exception_callback)

        # create an instance of the concrete driver class defined below
        driver = DataSetDriver(parser, particleDataHdlrObj)
        driver.processFileStream()

    return particleDataHdlrObj
def get_serial_num_from_filepath(filepath):
    """
    Parse the serial number from the file path
    :param filepath: The full path of the file to extract the serial number from the name
    :return: serial number
    """
    # get just the filename from the full path
    filename = os.path.basename(filepath)

    # match the filename, serial number is the first group
    filename_match = FILENAME_MATCHER.match(filename)

    # can't run the parser without the serial number, so raise an exception if it can't be found
    if not filename_match:
        raise DatasetParserException(
            "Unable to parse serial number from file name %s" % filename)

    # return the serial number as an int
    return int(filename_match.group(1))
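# Hypothetical usage (the exact filename layout is defined by FILENAME_MATCHER,
# which is not shown here, so the path below is only a placeholder):
#
#     serial_num = get_serial_num_from_filepath('/omc_data/ctdmo_12345_recovered.dat')
#     # serial_num -> 12345, assuming the serial number is the first regex group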
def _read_data(self, data_record):
    """
    Read in the column labels, data type, number of bytes of each data type,
    and the data from an ASCII glider data file.
    """
    data_dict = {}
    data_labels = self._header_dict['labels']
    data = data_record.strip().split()

    if self.num_columns != len(data):
        err_msg = "GliderParser._read_data(): Num Of Columns NOT EQUAL to Num of Data items: " + \
                  "Expected Columns= %s vs Actual Data= %s" % (self.num_columns, len(data))
        log.error(err_msg)
        raise DatasetParserException(err_msg)

    # extract record to dictionary
    for ii, value in enumerate(data):
        label = data_labels[ii]
        data_dict[label] = value

    return data_dict
def __init__(self, config, stream_handle, exception_callback):

    #
    # Verify that the required parameters are in the parser configuration.
    #
    if INDUCTIVE_ID_KEY not in config:
        raise DatasetParserException("Parser config is missing %s" %
                                     INDUCTIVE_ID_KEY)

    #
    # File is ASCII with records separated by newlines.
    #
    super(CtdmoGhqrRecoveredCtParser, self).__init__(config,
                                                     stream_handle,
                                                     exception_callback)

    #
    # set flags to indicate the end of Configuration has not been reached
    # and the serial number has not been found.
    #
    self._serial_number = None
    self._end_config = False

    self.input_file = stream_handle
def __init__(self, config, stream_handle, exception_callback,
             data_record_regex, header_key_list=None, ignore_matcher=None):
    """
    This method is a constructor that will instantiate a CsppParser object.
    @param config The configuration for this CsppParser parser
    @param stream_handle The handle to the data stream containing the cspp data
    @param exception_callback The function to call to report exceptions
    @param data_record_regex The data regex that should be used to obtain data records
    @param header_key_list The list of header keys expected within a header
    @param ignore_matcher A matcher from a regex to use to ignore expected junk lines
    """
    self._data_record_matcher = None
    self._header_and_first_data_record_matcher = None
    self._ignore_matcher = ignore_matcher

    # Ensure that we have a data regex
    if data_record_regex is None:
        log.warn('A data_record_regex is required, but None was given')
        raise DatasetParserException("Must provide a data_record_regex")
    else:
        self._data_record_matcher = re.compile(data_record_regex)

    # Build up the header state dictionary using the default header key list or one that was provided
    self._header_state = {}

    if header_key_list is None:
        header_key_list = DEFAULT_HEADER_KEY_LIST

    for header_key in header_key_list:
        self._header_state[header_key] = None

    # Obtain the particle classes dictionary from the config data
    if DataSetDriverConfigKeys.PARTICLE_CLASSES_DICT in config:
        particle_classes_dict = config.get(DataSetDriverConfigKeys.PARTICLE_CLASSES_DICT)

        # Set the metadata and data particle classes to be used later
        if METADATA_PARTICLE_CLASS_KEY in particle_classes_dict and \
                DATA_PARTICLE_CLASS_KEY in particle_classes_dict:
            self._data_particle_class = particle_classes_dict.get(DATA_PARTICLE_CLASS_KEY)
            self._metadata_particle_class = particle_classes_dict.get(METADATA_PARTICLE_CLASS_KEY)
        else:
            log.warning(
                'Configuration missing metadata or data particle class key in particle classes dict')
            raise ConfigurationException(
                'Configuration missing metadata or data particle class key in particle classes dict')
    else:
        log.warning('Configuration missing particle classes dict')
        raise ConfigurationException('Configuration missing particle classes dict')

    # Initialize the record buffer to an empty list
    self._record_buffer = []
    # Initialize the metadata flag
    self._metadata_extracted = False

    # Call the superclass constructor
    super(CsppParser, self).__init__(config, stream_handle, exception_callback)
def __init__(self, config, state, stream_handle, state_callback,
             publish_callback, exception_callback, data_record_regex,
             header_key_list=None, ignore_matcher=None, *args, **kwargs):
    """
    This method is a constructor that will instantiate a CsppParser object.
    @param config The configuration for this CsppParser parser
    @param state The state the CsppParser should use to initialize itself
    @param stream_handle The handle to the data stream containing the cspp data
    @param state_callback The function to call upon detecting state changes
    @param publish_callback The function to call to provide particles
    @param exception_callback The function to call to report exceptions
    @param data_record_regex The data regex that should be used to obtain data records
    @param header_key_list The list of header keys expected within a header
    @param ignore_matcher A matcher from a regex to use to ignore expected junk lines
    """
    self._data_record_matcher = None
    self._header_and_first_data_record_matcher = None
    self._ignore_matcher = ignore_matcher

    # Ensure that we have a data regex
    if data_record_regex is None:
        log.warn('A data_record_regex is required, but None was given')
        raise DatasetParserException("Must provide a data_record_regex")
    else:
        self._data_record_matcher = re.compile(data_record_regex)

    # Build up the header state dictionary using the default header key list or one that was provided
    self._header_state = {}

    if header_key_list is None:
        header_key_list = DEFAULT_HEADER_KEY_LIST

    for header_key in header_key_list:
        self._header_state[header_key] = None

    # Obtain the particle classes dictionary from the config data
    if DataSetDriverConfigKeys.PARTICLE_CLASSES_DICT in config:
        particle_classes_dict = config.get(DataSetDriverConfigKeys.PARTICLE_CLASSES_DICT)

        # Set the metadata and data particle classes to be used later
        if METADATA_PARTICLE_CLASS_KEY in particle_classes_dict and \
                DATA_PARTICLE_CLASS_KEY in particle_classes_dict:
            self._data_particle_class = particle_classes_dict.get(DATA_PARTICLE_CLASS_KEY)
            self._metadata_particle_class = particle_classes_dict.get(METADATA_PARTICLE_CLASS_KEY)
        else:
            log.warning(
                'Configuration missing metadata or data particle class key in particle classes dict')
            raise ConfigurationException(
                'Configuration missing metadata or data particle class key in particle classes dict')
    else:
        log.warning('Configuration missing particle classes dict')
        raise ConfigurationException('Configuration missing particle classes dict')

    # Initialize the record buffer to an empty list
    self._record_buffer = []

    # Initialize the read state
    self._read_state = {StateKey.POSITION: 0, StateKey.METADATA_EXTRACTED: False}

    # Call the superclass constructor
    super(CsppParser, self).__init__(config,
                                     stream_handle,
                                     state,
                                     partial(StringChunker.regex_sieve_function,
                                             regex_list=[SIEVE_MATCHER]),
                                     state_callback,
                                     publish_callback,
                                     exception_callback,
                                     *args, **kwargs)

    # If provided a state, set it. This needs to be done post superclass __init__
    if state:
        self.set_state(state)
def _read_file_definition(self):
    """
    Read the first 14 lines of the data file for the file definitions; values
    are colon delimited key value pairs. The pairs are parsed and stored in
    the header_dict member.
    """
    row_count = 0

    #
    # THIS METHOD ASSUMES A 14 ROW HEADER
    # If the number of header row lines in the glider ASCII input file changes
    # from 14, this method will NOT WORK
    #
    num_hdr_lines = 14

    header_pattern = r'(.*): (.*)$'
    header_re = re.compile(header_pattern)

    line = self._stream_handle.readline()

    while line and row_count < num_hdr_lines:

        match = header_re.match(line)

        if match:
            key = match.group(1)
            value = match.group(2)
            value = value.strip()

            # validate the expected header layout against the header info
            if key == 'num_ascii_tags':
                # this key has a required value of 14, otherwise we don't know
                # how to parse the file
                if int(value) != num_hdr_lines:
                    raise DatasetParserException("Header must be %d rows, but it is %s"
                                                 % (num_hdr_lines, value))

            elif key == 'num_label_lines':
                # this key has a required value of 3, otherwise we don't know
                # how to parse the file
                if int(value) != 3:
                    raise DatasetParserException(
                        "There must be 3 Label lines from the header for this parser")

            elif key == 'sensors_per_cycle':
                # save for future use
                self._header_dict[key] = int(value)

            elif key in ['filename_label', 'mission_name', 'fileopen_time']:
                # create a dictionary of these 3 key/value pair strings from
                # the header rows that need to be saved for future use
                self._header_dict[key] = value

        else:
            log.warn("Failed to parse header row: %s.", line)

        row_count += 1
        # only read the header lines in this method so make sure we stop
        if row_count < num_hdr_lines:
            line = self._stream_handle.readline()

    if row_count < num_hdr_lines:
        log.error('Not enough data lines for a full header')
        raise DatasetParserException('Not enough data lines for a full header')
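# Illustrative header fragment (assumed layout based on the keys handled above;
# a real glider ASCII file may differ). Each of the 14 header rows is a
# colon-delimited key/value pair, for example:
#
#     num_ascii_tags: 14
#     mission_name: EXAMPLE.MI
#     fileopen_time: Thu_Jan_01_00:00:00_1970
#     sensors_per_cycle: 350
#     num_label_lines: 3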
def __init__(self, config, stream_handle, state, state_callback,
             publish_callback, exception_callback, filename,
             instrument_particle_class, metadata_particle_class,
             *args, **kwargs):

    super(OptaaDjDclParser, self).__init__(config,
                                           stream_handle,
                                           state,
                                           self.sieve_function,
                                           state_callback,
                                           publish_callback,
                                           exception_callback,
                                           *args, **kwargs)

    self.input_file = stream_handle

    # If there's an existing state, update to it.
    # Otherwise default the position within the file to the beginning
    # and metadata particle not having been generated.
    if state is not None:
        self.set_state(state)
    else:
        self.set_state({OptaaStateKey.POSITION: 0,
                        OptaaStateKey.METADATA_GENERATED: False,
                        OptaaStateKey.TIME_SINCE_POWER_UP: 0.0})

    # Extract the start date and time from the filename and convert
    # it to the format expected for the output particle.
    # Calculate the ntp_time timestamp, the number of seconds since Jan 1, 1900,
    # based on the date and time from the filename.
    # This is the start time. Timestamps for each particle are derived from
    # the start time.

    filename_match = FILENAME_MATCHER.match(filename)
    if filename_match is not None:
        self.start_date = \
            filename_match.group(GROUP_YEAR) + '-' + \
            filename_match.group(GROUP_MONTH) + '-' + \
            filename_match.group(GROUP_DAY) + ' ' + \
            filename_match.group(GROUP_HOUR) + ':' + \
            filename_match.group(GROUP_MINUTE) + ':' + \
            filename_match.group(GROUP_SECOND)
        timestamp = (
            int(filename_match.group(GROUP_YEAR)),
            int(filename_match.group(GROUP_MONTH)),
            int(filename_match.group(GROUP_DAY)),
            int(filename_match.group(GROUP_HOUR)),
            int(filename_match.group(GROUP_MINUTE)),
            int(filename_match.group(GROUP_SECOND)),
            0, 0, 0)

        # The timestamp for each particle is:
        #     timestamp = start_time_from_file_name + (tn - t0)
        # where t0 is the time since power-up in the first record.
        elapsed_seconds = calendar.timegm(timestamp)
        self.ntp_time = ntplib.system_to_ntp_time(elapsed_seconds) - \
            self._read_state[OptaaStateKey.TIME_SINCE_POWER_UP]
    else:
        error_message = 'Invalid filename %s' % filename
        log.warn(error_message)
        raise DatasetParserException(error_message)

    # Save the names of the particle classes to be generated.
    self.instrument_particle_class = instrument_particle_class
    self.metadata_particle_class = metadata_particle_class