def make_some_data(self):
    import numpy as np

    stream_id = 'I am very special'
    definition = SBE37_CDM_stream_definition()
    definition.stream_resource_id = stream_id
    self.couch.create(definition)

    total = 200
    n = 10  # at most n records per granule
    i = 0
    while i < total:
        r = random.randint(1, n)

        psc = PointSupplementConstructor(point_definition=definition, stream_id=stream_id)
        for x in xrange(r):
            i += 1
            point_id = psc.add_point(time=i, location=(0, 0, 0))
            psc.add_scalar_point_coverage(point_id=point_id, coverage_id='temperature',
                                          value=np.random.normal(loc=48.0, scale=4.0, size=1)[0])
            psc.add_scalar_point_coverage(point_id=point_id, coverage_id='pressure', value=np.float32(1.0))
            psc.add_scalar_point_coverage(point_id=point_id, coverage_id='conductivity', value=np.float32(2.0))

        granule = psc.close_stream_granule()
        hdf_string = granule.identifiables[definition.data_stream_id].values
        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        with open(FileSystem.get_hierarchical_url(FS.CACHE, '%s.hdf5' % sha1), 'w') as f:
            f.write(hdf_string)

        granule.identifiables[definition.data_stream_id].values = ''
        self.couch.create(granule)
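# Illustrative sketch (not part of the original code): make_some_data above emits
# granules of random size (at most n points each) until roughly `total` points have
# been produced. The batching loop on its own, with plain Python lists in place of
# the PointSupplementConstructor:
import random

total = 200
n = 10  # at most n records per granule
i = 0
batches = []
while i < total:
    r = random.randint(1, n)
    batch = []
    for _ in range(r):
        i += 1
        batch.append(i)  # each point gets the running index as its timestamp
    batches.append(batch)

assert sum(len(b) for b in batches) >= total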
def read_persisted_cache(self, sha1, encoding):
    byte_string = None
    path = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.%s' % encoding)
    try:
        with open(path, 'r') as f:
            byte_string = f.read()
    except IOError as e:
        raise BadRequest(e.message)
    return byte_string
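# Illustrative sketch (not part of the original code): the snippets above persist a
# granule's byte string under a file named by its SHA1 and read it back later by
# rebuilding the same path. The minimal pattern, using only the standard library and
# a temporary directory in place of FileSystem / FS.CACHE:
import hashlib
import os
import tempfile

cache_dir = tempfile.mkdtemp()
byte_string = b'some granule payload'

# Write: key the cached file by the upper-cased SHA1 of its contents
sha1 = hashlib.sha1(byte_string).hexdigest().upper()
path = os.path.join(cache_dir, '%s.hdf5' % sha1)
with open(path, 'wb') as f:
    f.write(byte_string)

# Read: rebuild the path from the sha1 and the known encoding suffix
with open(path, 'rb') as f:
    assert f.read() == byte_string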
def process_stream(self, packet, dset_config):
    """
    Accepts a stream packet and a dset_config. According to the received dset_config
    it processes the stream, e.g. storing it in HDF storage and/or couch storage.
    @param packet The incoming data stream of type stream.
    @param dset_config The dset_config telling this method what to do with the incoming data stream.
    """
    ingestion_attributes = {'variables': [], 'number_of_records': -1, 'updated_metadata': False, 'updated_data': False}

    if dset_config is None:
        log.info('No dataset config for this stream!')
        return

    # Get back to the serialized form - the process receives only the IonObject
    # after the interceptor stack has decoded it...
    simple_dict = ion_serializer.serialize(packet)  # packet is an IonObject
    byte_string = msgpack.packb(simple_dict, default=encode_ion)

    encoding_type = 'ion_msgpack'

    # The persisted sha1 is computed from the byte string msgpack creates
    calculated_sha1 = hashlib.sha1(byte_string).hexdigest().upper()

    dataset_granule = {
        'stream_id'      : dset_config.stream_id,
        'dataset_id'     : dset_config.dataset_id,
        'persisted_sha1' : calculated_sha1,
        'encoding_type'  : encoding_type,
        'ts_create'      : get_ion_ts()
    }
    self.persist_immutable(dataset_granule)

    filename = FileSystem.get_hierarchical_url(FS.CACHE, calculated_sha1, '.%s' % encoding_type)
    with open(filename, mode='wb') as f:
        f.write(byte_string)

    return ingestion_attributes
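# Illustrative sketch (not part of the original code): process_stream above re-serializes
# the decoded IonObject with msgpack and keys the persisted file by the SHA1 of the
# resulting byte string. A stand-alone version of that idea, with a plain class instead
# of an IonObject and a hand-rolled `default` hook instead of encode_ion:
import hashlib
import msgpack

class Sample(object):
    def __init__(self, stream_id, value):
        self.stream_id = stream_id
        self.value = value

def encode_sample(obj):
    # Fallback msgpack calls for types it cannot pack natively
    if isinstance(obj, Sample):
        return {'stream_id': obj.stream_id, 'value': obj.value}
    raise TypeError('Cannot pack %r' % obj)

packet = Sample('stream-1', 42.0)
byte_string = msgpack.packb(packet, default=encode_sample)
persisted_sha1 = hashlib.sha1(byte_string).hexdigest().upper()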
def persist_file(self, file_data='', digest='', metadata=None):
    ds = self.container.datastore_manager.get_datastore(self.datastore_name, DS.DS_PROFILE.FILESYSTEM)
    validate_is_instance(file_data, basestring, "File or binary data must be a string.")
    validate_is_instance(metadata, File)

    if self.list_files(metadata.name + metadata.extension):
        raise BadRequest('%s already exists.' % (metadata.name + metadata.extension))

    digest_ = sha224(file_data).hexdigest()
    if digest:
        validate_equal(digest, digest_, "The provided digest does not match the file's digest. Ensure you are using sha224.")
    else:
        digest = digest_

    extension = metadata.extension
    if '.' in metadata.name:
        t = metadata.name.split('.')
        metadata.name, metadata.extension = ('.'.join(t[:-1]), '.' + t[-1])

    url = FileSystem.get_hierarchical_url(FS.CACHE, digest, extension)
    try:
        with open(url, 'w+b') as f:
            f.write(file_data)
    except Exception:
        log.exception('Failed to write %s', url)
        raise BadRequest('Could not successfully write file data')

    if metadata.name[0] != '/':
        metadata.name = '/' + metadata.name

    metadata.url = url
    metadata.digest = digest
    metadata.created_date = IonTime().to_string()
    metadata.modified_date = IonTime().to_string()
    metadata.size = len(file_data)

    doc_id, rev_id = ds.create(metadata)
    return doc_id
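# Illustrative sketch (not part of the original code): persist_file above accepts an
# optional client-supplied digest and rejects the upload when it does not match the
# SHA224 computed over the file data. The core check, with only hashlib:
import hashlib

def verify_sha224(file_data, provided_digest=None):
    computed = hashlib.sha224(file_data).hexdigest()
    if provided_digest and provided_digest != computed:
        raise ValueError("The provided digest does not match the file's digest. "
                         "Ensure you are using sha224.")
    return computed

digest = verify_sha224(b'file contents')          # no digest provided: just compute it
digest = verify_sha224(b'file contents', digest)  # matching digest passes the check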
def _parse_granule(self, granule):
    '''
    @brief Ensures the granule is valid and extracts metadata from it for building the dataset
    @param granule raw granule straight from couch
    @return metadata from the granule as well as the granule itself, if valid; None otherwise
    '''
    granule.stream_resource_id = self.stream_id

    element_count_id = self.element_count_id
    encoding_id = self.encoding_id

    record_count = granule.identifiables[element_count_id].value

    # If there are no records then this is not a proper granule
    if not (record_count > 0):
        log.debug('Granule had no record count, discarding.')
        return None

    # No encoding, no packet
    if encoding_id not in granule.identifiables:
        log.debug('Granule had no encoding, discarding.')
        return None

    sha1 = granule.identifiables[encoding_id].sha1 or None
    if not sha1:
        log.debug('Granule had no sha1, discarding.')
        return None

    filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')
    if not os.path.exists(filepath):
        log.debug('File with sha1 does not exist, discarding.')
        return None

    return {
        'granule': granule,
        'records': record_count,
        'sha1': sha1
    }
def process_stream(self, packet, dset_config):
    """
    Accepts a stream packet and a dset_config. According to the received dset_config
    it processes the stream, e.g. storing it in HDF storage and/or couch storage.
    @param packet The incoming data stream of type stream.
    @param dset_config The dset_config telling this method what to do with the incoming data stream.
    """
    ingestion_attributes = {'variables': [], 'number_of_records': -1, 'updated_metadata': False, 'updated_data': False}

    if dset_config is None:
        log.info('No dataset config for this stream!')
        return

    values_string = ''
    sha1 = ''
    encoding_type = ''

    for key, value in packet.identifiables.iteritems():
        if isinstance(value, DataStream):
            values_string = value.values
            value.values = ''
        elif isinstance(value, Encoding):
            sha1 = value.sha1
            encoding_type = value.encoding_type
        elif isinstance(value, Coverage):
            ingestion_attributes['variables'].append(key)
        elif isinstance(value, CountElement):
            ingestion_attributes['number_of_records'] = value.value

    if dset_config.archive_metadata is True:
        log.debug("Persisting data....")
        ingestion_attributes['updated_metadata'] = True
        self.persist_immutable(packet)

    if dset_config.archive_data is True:
        #@todo - grab the filepath to save the hdf string somewhere..
        ingestion_attributes['updated_data'] = True
        if values_string:
            calculated_sha1 = hashlib.sha1(values_string).hexdigest().upper()
            filename = FileSystem.get_hierarchical_url(FS.CACHE, calculated_sha1, '.%s' % encoding_type)

            if sha1 != calculated_sha1:
                raise IngestionWorkerException('The stored sha1 differs from the one calculated from the received hdf_string')

            with open(filename, mode='wb') as f:
                f.write(values_string)
        else:
            log.warn("Nothing to write!")

    return ingestion_attributes
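# Illustrative sketch (not part of the original code): before writing the HDF byte string,
# process_stream above recomputes its SHA1 and refuses to persist it when the value carried
# in the packet's Encoding does not agree. The check in isolation:
import hashlib

def checked_write(path, values_string, stored_sha1):
    calculated_sha1 = hashlib.sha1(values_string).hexdigest().upper()
    if stored_sha1 != calculated_sha1:
        raise ValueError('The stored sha1 differs from the one calculated from the received hdf_string')
    with open(path, 'wb') as f:
        f.write(values_string)
    return calculated_sha1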
def _get_coverage(cls, dataset_id):
    filename = FileSystem.get_hierarchical_url(FS.CACHE, dataset_id, '.cov')
    coverage = SimplexCoverage.load(filename)
    return coverage
def _persist_coverage(cls, dataset_id, coverage):
    validate_is_instance(coverage, SimplexCoverage, 'Coverage is not an instance of SimplexCoverage: %s' % type(coverage))
    filename = FileSystem.get_hierarchical_url(FS.CACHE, dataset_id, '.cov')
    SimplexCoverage.save(coverage, filename, use_ascii=False)
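# Illustrative sketch (not part of the original code): the pair above saves a coverage to a
# '.cov' file keyed by the dataset id and loads it back by rebuilding the same path. The same
# persist/load-by-id round trip, with pickle standing in for the SimplexCoverage save/load API
# and a temporary directory standing in for FS.CACHE:
import os
import pickle
import tempfile

cache_dir = tempfile.mkdtemp()

def persist_by_id(dataset_id, obj):
    with open(os.path.join(cache_dir, dataset_id + '.cov'), 'wb') as f:
        pickle.dump(obj, f)

def load_by_id(dataset_id):
    with open(os.path.join(cache_dir, dataset_id + '.cov'), 'rb') as f:
        return pickle.load(f)

persist_by_id('dataset-123', {'temperature': [48.0, 47.5]})
assert load_by_id('dataset-123') == {'temperature': [48.0, 47.5]}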
def _merge(self, msgs):
    '''
    @brief Merges all the granules and datasets into one large dataset (union)
    @param msgs raw granules from couch
    @return complete dataset
    @description
             n
        D := U [ msgs_i ]
            i=0
    '''
    granule = None
    file_list = list()
    count = len(msgs)
    used_vals = list()

    #-------------------------------------------------------------------------------------
    # Merge each granule to another granule one by one.
    # After each merge operation keep track of what files belong where on the timeline.
    #-------------------------------------------------------------------------------------

    for i in xrange(count):
        if i == 0:
            granule = msgs[0]['granule']
            psc = PointSupplementConstructor(point_definition=self.definition)

            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])

        else:
            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])
            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])
            if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                file_list.append(tuple(file_pair[1]))
                used_vals.append(file_pair[1][0])

    if not granule:
        return
    log.debug('file_list: %s', file_list)

    #-------------------------------------------------------------------------------------
    # Order the lists using a stable sort from python (by the first value in the tuples),
    # then peel off just the file names,
    # then get the appropriate URL for the file using FileSystem.
    #-------------------------------------------------------------------------------------
    file_list.sort()
    file_list = list(i[1] for i in file_list)
    file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    codec = HDFEncoder()
    log.debug('acquire_data:')
    log.debug('\tfile_list: %s', file_list)
    log.debug('\tfields: %s', var_names)
    log.debug('\trecords: %s', record_count)

    data = acquire_data(file_list, var_names, record_count).next()

    for row, value in data.iteritems():
        value_path = self._find_vp(pairs, row)
        codec.add_hdf_dataset(value_path, nparray=value['values'])
        #-------------------------------------------------------------------------------------
        # Debugging
        #-------------------------------------------------------------------------------------
        log.debug('row: %s', row)
        log.debug('value path: %s', value_path)
        log.debug('value: %s', value['values'])

    hdf_string = codec.encoder_close()
    self._patch_granule(granule, hdf_string)

    return granule
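# Illustrative sketch (not part of the original code): after merging, _merge above orders
# pairs whose first element is a position on the timeline (assumed here to be a timestamp)
# with Python's stable sort, then peels off just the file names before mapping each one to a
# cache URL. The same ordering trick on plain tuples:
file_pairs = [(20, 'B.hdf5'), (10, 'A.hdf5'), (30, 'C.hdf5')]
file_pairs.sort()                              # stable sort by the first tuple element
file_names = [pair[1] for pair in file_pairs]  # peel off just the file names
assert file_names == ['A.hdf5', 'B.hdf5', 'C.hdf5']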
def test_raw_stream_integration(self):
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here to run as a script (don't forget the imports of course!)
    #-----------------------------

    # Create some service clients...
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    # declare some handy variables
    datastore_name = 'test_dm_integration'

    ###
    ### In the beginning there was one stream definition...
    ###

    # create a stream definition for the data from the ctd simulator
    raw_ctd_stream_def = SBE37_RAW_stream_definition()
    raw_ctd_stream_def_id = pubsub_management_service.create_stream_definition(container=raw_ctd_stream_def, name='Simulated RAW CTD data')

    ###
    ### And two process definitions...
    ###
    # one for the ctd simulator...
    producer_definition = ProcessDefinition()
    producer_definition.executable = {
        'module': 'ion.processes.data.raw_stream_publisher',
        'class': 'RawStreamPublisher'
    }

    raw_ctd_sim_procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)

    #---------------------------
    # Set up ingestion - this is an operator concern - not done by SA in a deployed system
    #---------------------------
    # Configure ingestion with one worker, ingesting to the test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
        number_of_workers=1
    )

    ingestion_management_service.activate_ingestion_configuration(ingestion_configuration_id=ingestion_configuration_id)

    #---------------------------
    # Set up the producer (CTD Simulator)
    #---------------------------

    # Create the stream
    raw_ctd_stream_id = pubsub_management_service.create_stream(stream_definition_id=raw_ctd_stream_def_id)

    # Set up the dataset
    raw_ctd_dataset_id = dataset_management_service.create_dataset(
        stream_id=raw_ctd_stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule'
    )

    # Configure ingestion of this dataset
    raw_ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id=raw_ctd_dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id  # you need to know the ingestion configuration id!
    )
    # Hold onto raw_ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

    # Start the ctd simulator to produce some data
    configuration = {
        'process': {
            'stream_id': raw_ctd_stream_id,
        }
    }
    raw_sim_pid = process_dispatcher.schedule_process(process_definition_id=raw_ctd_sim_procdef_id, configuration=configuration)

    ###
    ### Make a subscriber in the test to listen for raw data
    ###
    raw_subscription_id = pubsub_management_service.create_subscription(
        query=StreamQuery([raw_ctd_stream_id]),
        exchange_name='raw_test',
        name="test raw subscription"
    )

    # this is okay - even in cei mode!
    pid = cc.spawn_process(name='dummy_process_for_test', module='pyon.ion.process', cls='SimpleProcess', config={})
    dummy_process = cc.proc_manager.procs[pid]

    subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process, node=cc.node)

    result = gevent.event.AsyncResult()
    results = []

    def message_received(message, headers):
        log.warn('Raw data received!')
        results.append(message)
        if len(results) > 3:
            result.set(True)

    subscriber = subscriber_registrar.create_subscriber(exchange_name='raw_test', callback=message_received)
    subscriber.start()

    # after the queue has been created it is safe to activate the subscription
    pubsub_management_service.activate_subscription(subscription_id=raw_subscription_id)

    # Assert that we have received data
    assertions(result.get(timeout=10))

    # stop the flow and parse the messages...
    process_dispatcher.cancel_process(raw_sim_pid)  # kill the ctd simulator process - that is enough data

    gevent.sleep(1)

    for message in results:
        sha1 = message.identifiables['stream_encoding'].sha1
        data = message.identifiables['data_stream'].values
        filename = FileSystem.get_hierarchical_url(FS.CACHE, sha1, ".raw")
        with open(filename, 'r') as f:
            assertions(data == f.read())