def _splice_coverage(cls, dataset_id, scov):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    vcov = cls._get_coverage(dataset_id, mode='a')
    scov_pth = scov.persistence_dir
    if isinstance(vcov.reference_coverage, SimplexCoverage):
        ccov = ComplexCoverage(file_root, uuid4().hex, 'Complex coverage for %s' % dataset_id,
                               reference_coverage_locs=[vcov.head_coverage_path, ],
                               parameter_dictionary=ParameterDictionary(),
                               complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
        log.info('Creating Complex Coverage: %s', ccov.persistence_dir)
        ccov.append_reference_coverage(scov_pth)
        ccov_pth = ccov.persistence_dir
        ccov.close()
        vcov.replace_reference_coverage(ccov_pth)
    elif isinstance(vcov.reference_coverage, ComplexCoverage):
        log.info('Appending simplex coverage to complex coverage')
        #vcov.reference_coverage.append_reference_coverage(scov_pth)
        dir_path = vcov.reference_coverage.persistence_dir
        vcov.close()
        ccov = AbstractCoverage.load(dir_path, mode='a')
        ccov.append_reference_coverage(scov_pth)
        ccov.refresh()
        ccov.close()
    vcov.refresh()
    vcov.close()
def __init__(self, name=None):
    """
    @param name The name of the dataset
    """
    # generate a random name for the filename if it has not been provided.
    self.filename = FileSystem.get_url(fs=FS.TEMP, filename=name or random_name(), ext='encoder.hdf5')

    # Using inline imports to put off making hdf/numpy required dependencies
    import h5py

    # open an hdf file on disk - in /tmp to write data to since we can't yet do in memory
    try:
        log.debug("Creating h5py file object for the encoder at %s" % self.filename)
        if os.path.isfile(self.filename):
            # if file exists, then append to it
            self.h5pyfile = h5py.File(self.filename, mode='r+', driver='core')
        else:
            # if file does not already exist, write a new one
            self.h5pyfile = h5py.File(self.filename, mode='w', driver='core')
        assert self.h5pyfile, 'No h5py file object created.'
    except IOError:
        log.debug("Error opening file for the HDFEncoder!")
        raise HDFEncoderException("Error while trying to open file.")
    except AssertionError as err:
        log.debug(err.message)
        raise HDFEncoderException(err.message)
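# Side note (illustrative, not part of the encoder): the exists-check plus 'r+'/'w'
# branch above mirrors what h5py's append mode already does. A minimal standalone
# sketch of the same open-or-create behavior, independent of the ION FileSystem
# helper; the path and group name below are made up for the example:
import os
import tempfile

import h5py

path = os.path.join(tempfile.gettempdir(), 'encoder_sketch.hdf5')
with h5py.File(path, mode='a', driver='core') as h5pyfile:   # 'a' == 'r+' if the file exists, else 'w'
    h5pyfile.require_group('fields')                         # idempotent group creation
os.remove(path)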
def _create_coverage(self, dataset_id, description, parameter_dict, spatial_domain, temporal_domain):
    pdict = ParameterDictionary.load(parameter_dict)
    sdom = GridDomain.load(spatial_domain)
    tdom = GridDomain.load(temporal_domain)
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    scov = SimplexCoverage(file_root, dataset_id, description or dataset_id,
                           parameter_dictionary=pdict,
                           temporal_domain=tdom,
                           spatial_domain=sdom,
                           inline_data_writes=self.inline_data_writes)
    return scov
def on_start(self):
    super(TransformCapture, self).on_start()
    # #@todo: Remove debugging statements
    log.debug('(Transform: %s) Starting...', self.name)
    self.file_name = self.CFG.get_safe('process.file_name', FileSystem.get_url(FS.TEMP, 'transform_output'))
def create_known(dataset_name, rootgrp_name, grp_name):
    """
    A known array to compare against during tests
    """
    known_array = numpy.random.rand(10, 20)

    filename = FileSystem.get_url(FS.TEMP, random_name(), ".hdf5")

    # Write an hdf file with known values to compare against
    h5pyfile = h5py.File(filename, mode='w', driver='core')
    grp = h5pyfile.create_group(rootgrp_name)
    subgrp = grp.create_group(grp_name)
    dataset = subgrp.create_dataset(dataset_name, known_array.shape, known_array.dtype.str,
                                    compression='gzip', compression_opts=4, maxshape=(None, None))
    dataset.write_direct(known_array)
    h5pyfile.close()

    # convert the hdf file into a binary string
    f = open(filename, mode='rb')
    # read the binary string representation of the file
    known_hdf_as_string = f.read()  # this is a known string to compare against during tests
    f.close()

    # cleaning up
    FileSystem.unlink(f.name)

    return known_array, known_hdf_as_string
def create_known(dataset_name, rootgrp_name, grp_name):
    """
    A known array to compare against during tests
    """
    known_array = numpy.ones((10, 20))

    filename = FileSystem.get_url(FS.TEMP, random_name(), ".hdf5")

    # Write an hdf file with known values to compare against
    h5pyfile = h5py.File(filename, mode='w', driver='core')
    grp = h5pyfile.create_group(rootgrp_name)
    subgrp = grp.create_group(grp_name)
    dataset = subgrp.create_dataset(dataset_name, known_array.shape, known_array.dtype.str,
                                    maxshape=(None, None))
    dataset.write_direct(known_array)
    h5pyfile.close()

    # convert the hdf file into a binary string
    f = open(filename, mode='rb')
    # read the binary string representation of the file
    known_hdf_as_string = f.read()  # this is a known string to compare against during tests
    f.close()

    # cleaning up
    FileSystem.unlink(f.name)

    return known_array, known_hdf_as_string
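# The two create_known variants differ mainly in dataset options: maxshape=(None, None)
# makes the dataset resizable along either axis (which forces chunking), and the first
# variant additionally compresses with gzip. A short standalone sketch of what maxshape
# buys you; the file path here is illustrative:
import os
import tempfile

import h5py
import numpy

path = os.path.join(tempfile.gettempdir(), 'resizable_sketch.hdf5')
with h5py.File(path, 'w', driver='core') as f:
    dset = f.create_dataset('known', (10, 20), dtype='f8', maxshape=(None, None))
    dset.write_direct(numpy.ones((10, 20)))
    dset.resize((30, 20))                    # grow along the first axis later on
    dset[10:30, :] = numpy.zeros((20, 20))
os.remove(path)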
def upload_qc():
    upload_folder = FileSystem.get_url(FS.TEMP, 'uploads')
    try:
        object_store = Container.instance.object_store

        # required fields
        upload = request.files['file']  # <input type=file name="file">
        if upload:
            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)
            filetype = _check_magic(upload) or 'CSV'  # Either going to be ZIP or CSV, probably

            # register upload
            file_upload_context = {
                'name': 'User uploaded QC file %s' % filename,
                'filename': filename,
                'filetype': filetype,  # only CSV, no detection necessary
                'path': path,
                'upload_time': upload_time,
                'status': 'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_qc_processor',
                executable={
                    'module': 'ion.processes.data.upload.upload_qc_processing',
                    'class': 'UploadQcProcessing'
                }
            )
            process_definition_id = pd_client.create_process_definition(process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            # schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            pid = pd_client.schedule_process(process_definition_id, process_id=process_id, configuration=config)
            log.info('UploadQcProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')
    except Exception as e:
        return build_error_response(e)
def _create_coverage(self, dataset_id, parameter_dict_id, time_dom, spatial_dom):
    pd = self.dataset_management_client.read_parameter_dictionary(parameter_dict_id)
    pdict = ParameterDictionary.load(pd)
    sdom = GridDomain.load(spatial_dom.dump())
    tdom = GridDomain.load(time_dom.dump())
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    scov = SimplexCoverage(file_root, dataset_id, dataset_id,
                           parameter_dictionary=pdict,
                           temporal_domain=tdom,
                           spatial_domain=sdom)
    return scov
def _create_view_coverage(self, dataset_id, description, parent_dataset_id):
    # As annoying as it is, we need to load the view coverage belonging to the parent dataset id and use the
    # information inside to build the new one...
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    pscov = self._get_simplex_coverage(parent_dataset_id, mode='r')
    scov_location = pscov.persistence_dir
    pscov.close()
    vcov = ViewCoverage(file_root, dataset_id, description or dataset_id, reference_coverage_location=scov_location)
    return vcov
def _create_coverage(self, dataset_id, description, parameter_dict, spatial_domain, temporal_domain):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    pdict = ParameterDictionary.load(parameter_dict)
    sdom = GridDomain.load(spatial_domain)
    tdom = GridDomain.load(temporal_domain)
    scov = self._create_simplex_coverage(dataset_id, pdict, sdom, tdom, self.inline_data_writes)
    vcov = ViewCoverage(file_root, dataset_id, description or dataset_id,
                        reference_coverage_location=scov.persistence_dir)
    scov.close()
    return vcov
def process(self, packet):
    input = int(packet.get('num', 0))

    prep = 'echo \'1+%d\' | bc' % (input)
    output = commands.getoutput(prep)
    if self.has_output:
        self.publish(dict(num=output))

    with open(FileSystem.get_url(FS.TEMP, "transform_output"), 'a') as f:
        f.write('(%s): Received %s, transform: %s\n' % (self.name, packet, output))
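# Note: the commands module used above is Python 2 only; on Python 3 the same shell-out
# would go through subprocess. A minimal equivalent sketch (the external 'bc' dependency
# is kept from the original transform and must be installed for this to run):
import subprocess

num = 3
output = subprocess.check_output("echo '1+%d' | bc" % num, shell=True).strip()
print(output)   # b'4'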
def _create_complex_coverage(cls, dataset_id, description, parameter_dict):
    pdict = ParameterDictionary.load(parameter_dict)
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    ccov = ComplexCoverage(file_root, dataset_id, 'Complex Coverage for %s' % dataset_id,
                           parameter_dictionary=pdict,
                           complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
    return ccov
def _create_simplex_coverage(cls, dataset_id, parameter_dictionary, spatial_domain, temporal_domain):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    scov = SimplexCoverage(file_root, dataset_id, 'Simplex Coverage for %s' % dataset_id,
                           parameter_dictionary=parameter_dictionary,
                           temporal_domain=temporal_domain,
                           spatial_domain=spatial_domain)
    return scov
def check_msg(msg, header):
    assertions(isinstance(msg, StreamGranuleContainer), 'Msg is not a container')
    hdf_string = msg.identifiables[msg.data_stream_id].values
    sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
    log.debug('Sha1 matches')
    log.debug('Dumping file so you can inspect it.')
    log.debug('Records: %d' % msg.identifiables['record_count'].value)
    with open(FileSystem.get_url(FS.TEMP, '%s.cap.hdf5' % sha1[:8]), 'w') as f:
        f.write(hdf_string)
        log.debug('Stream Capture: %s', f.name)
    result.set(True)
def process(self, packet):
    """Processes incoming data!!!!
    """
    output = int(packet.get('num', 0)) + 1

    log.debug('(%s) Processing Packet: %s', self.name, packet)
    log.debug('(%s) Transform Complete: %s', self.name, output)

    if self.has_output:
        self.publish(dict(num=str(output)))

    with open(FileSystem.get_url(FS.TEMP, "transform_output"), 'a') as f:
        f.write('(%s): Received Packet: %s\n' % (self.name, packet))
        f.write('(%s): - Transform - %d\n' % (self.name, output))
def setUp(self):
    import numpy, h5py

    FileSystem(DotDict())

    #--------------------------------------------------------------------
    # Create an hdf file for testing
    #--------------------------------------------------------------------
    self.salinity = [0, ] * 3
    self.temperature = [0, ] * 3
    self.pressure = [0, ] * 3

    self.salinity[0] = numpy.arange(50)
    self.salinity[1] = numpy.arange(50) + 50
    self.salinity[2] = numpy.arange(50) + 100

    self.temperature[0] = numpy.random.normal(size=50)
    self.temperature[1] = numpy.random.normal(size=50)
    self.temperature[2] = numpy.random.normal(size=50)

    self.pressure[0] = numpy.random.uniform(low=0.0, high=1.0, size=50)
    self.pressure[1] = numpy.random.uniform(low=0.0, high=1.0, size=50)
    self.pressure[2] = numpy.random.uniform(low=0.0, high=1.0, size=50)

    # provide the check_pieces method the size of the dataset so that it can do its checking.
    self.sl = slice(0, 150)

    self.fnames = [0, ] * 3
    for i in range(0, 3):
        self.fnames[i] = FileSystem.get_url(FS.TEMP, 'data%d.hdf5' % (i + 1))

    for fname, s, t, p in zip(self.fnames, self.salinity, self.temperature, self.pressure):
        file = h5py.File(fname, 'w')
        grp1 = file.create_group('fields')
        dset1 = grp1.create_dataset("salinity", data=s)
        dset2 = grp1.create_dataset("temperature", data=t)
        dset3 = grp1.create_dataset("pressure", data=p)
        file.close()

    # Concatenate the test values for comparison:
    self.t_result = numpy.concatenate((self.temperature[0], self.temperature[1], self.temperature[2]), axis=0)
    self.s_result = numpy.concatenate((self.salinity[0], self.salinity[1], self.salinity[2]), axis=0)
    self.p_result = numpy.concatenate((self.pressure[0], self.pressure[1], self.pressure[2]), axis=0)
def __init__(self, hdf_string):
    """
    @param hdf_string
    """
    #try:
    assert isinstance(hdf_string, basestring), 'The input for instantiating the HDFDecoder object is not a string'
    #except AssertionError as err:
    #    raise HDFDecoderException(err.message)

    self.filename = FileSystem.get_url(fs=FS.TEMP, filename=hashlib.sha1(hdf_string).hexdigest(), ext='_decoder.hdf5')

    #try:
    # save an hdf string to disk - in /tmp so we can open it as an hdf file and read data from it
    f = open(self.filename, mode='wb')
    f.write(hdf_string)
    f.close()
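# Why the decoder names its temp file after the SHA-1 of the HDF byte string:
# identical payloads map to identical paths, so repeated decodes of the same
# granule land on the same file, and corruption is detectable by recomputing the
# digest. A small self-contained illustration; the payload and path below are
# made up for the example:
import hashlib
import os
import tempfile

hdf_string = b'\x89HDF\r\n\x1a\n... pretend this is a real HDF5 byte string ...'
digest = hashlib.sha1(hdf_string).hexdigest()
path = os.path.join(tempfile.gettempdir(), digest + '_decoder.hdf5')

with open(path, 'wb') as f:
    f.write(hdf_string)

# Re-reading and re-hashing verifies the round trip, the same kind of check the
# replay and ingestion snippets in this collection perform with .hexdigest().upper().
with open(path, 'rb') as f:
    assert hashlib.sha1(f.read()).hexdigest() == digest
os.remove(path)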
def _create_simplex_coverage(cls, dataset_id, parameter_dictionary, spatial_domain, temporal_domain, inline_data_writes=True):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    scov = SimplexCoverage(file_root, uuid4().hex, 'Simplex Coverage for %s' % dataset_id,
                           parameter_dictionary=parameter_dictionary,
                           temporal_domain=temporal_domain,
                           spatial_domain=spatial_domain,
                           inline_data_writes=inline_data_writes)
    return scov
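# Design note: unlike the variant above that keys the coverage directory on dataset_id
# directly, this version uses uuid4().hex as the persistence GUID, so several simplex
# coverages can coexist for one dataset (for example when they are later spliced under a
# TEMPORAL_AGGREGATION complex coverage). A tiny illustration of the resulting layout;
# the root path and dataset id are assumed values, not what FileSystem actually returns:
import os
from uuid import uuid4

file_root = '/tmp/ion/cache/datasets'   # assumed stand-in for FileSystem.get_url(FS.CACHE, 'datasets')
dataset_id = 'abc123'
guids = [uuid4().hex for _ in range(2)]
print([os.path.join(file_root, g) for g in guids])   # two distinct coverage dirs for the same dataset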
def __init__(self, name=None):
    """
    @param name The name of the dataset
    """
    # generate a random name for the filename if it has not been provided.
    self.filename = FileSystem.get_url(fs=FS.TEMP, filename=name or random_name(), ext='encoder.hdf5')

    # Using inline imports to put off making hdf/numpy required dependencies
    import h5py

    # open an hdf file on disk - in /tmp to write data to since we can't yet do in memory
    log.debug("Creating h5py file object for the encoder at %s" % self.filename)
    if os.path.isfile(self.filename):
        # if file exists, then append to it
        self.h5pyfile = h5py.File(self.filename, mode='r+', driver='core')
    else:
        # if file does not already exist, write a new one
        self.h5pyfile = h5py.File(self.filename, mode='w', driver='core')
    assert self.h5pyfile, 'No h5py file object created.'
def upload_data(dataproduct_id):
    upload_folder = FileSystem.get_url(FS.TEMP, 'uploads')
    try:
        rr_client = ResourceRegistryServiceProcessClient(process=service_gateway_instance)
        object_store = Container.instance.object_store

        try:
            rr_client.read(str(dataproduct_id))
        except BadRequest:
            raise BadRequest('Unknown DataProduct ID %s' % dataproduct_id)

        # required fields
        upload = request.files['file']  # <input type=file name="file">

        # determine filetype
        filetype = _check_magic(upload)
        upload.seek(0)  # return to beginning for save

        if upload and filetype is not None:
            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)

            # register upload
            file_upload_context = {
                # TODO add dataproduct_id
                'name': 'User uploaded file %s' % filename,
                'filename': filename,
                'filetype': filetype,
                'path': path,
                'upload_time': upload_time,
                'status': 'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_data_processor',
                executable={
                    'module': 'ion.processes.data.upload.upload_data_processing',
                    'class': 'UploadDataProcessing'
                }
            )
            process_definition_id = pd_client.create_process_definition(process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            # schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            config.process.dp_id = dataproduct_id
            pid = pd_client.schedule_process(process_definition_id, process_id=process_id, configuration=config)
            log.info('UploadDataProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')
    except Exception as e:
        return build_error_response(e)
def _get_coverage(cls, dataset_id, mode='w'):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    coverage = AbstractCoverage.load(file_root, dataset_id, mode=mode)
    return coverage
def _get_coverage_path(cls, dataset_id):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    return os.path.join(file_root, '%s' % dataset_id)
def _publish_query(self, results):
    '''
    Callback to publish the specified results
    '''
    #-----------------------
    # Iteration
    #-----------------------
    #  - Go through the results. If the user had include_docs=True in the options field
    #    then the full document is in result.doc; however, if the query did not include_docs,
    #    then only the doc_id is provided in result.value.
    #
    #  - What this allows us to do is limit the amount of traffic for large queries.
    #    If we are only making a query in a sequence of queries (such as map and reduce) then we don't
    #    care about the full document yet; we only care about the doc id and will retrieve the document later.
    #  - Example:
    #      Imagine the blogging example: we want the latest blog by author George and all the comments for that blog.
    #      The series of queries would go post_by_updated -> posts_by_author -> posts_join_comments, and then
    #      in the last query we'll set include_docs to true and parse the docs.
    #-----------------------
    log.warn('results: %s', results)
    for result in results:
        log.warn('REPLAY Result: %s' % result)
        assert('doc' in result)
        replay_obj_msg = result['doc']

        if isinstance(replay_obj_msg, BlogBase):
            replay_obj_msg.is_replay = True
            self.lock.acquire()
            self.output.publish(replay_obj_msg)
            self.lock.release()

        elif isinstance(replay_obj_msg, StreamDefinitionContainer):
            replay_obj_msg.stream_resource_id = self.stream_id

        elif isinstance(replay_obj_msg, StreamGranuleContainer):
            # Override the resource_stream_id so ingestion doesn't reingest; also this is a NEW stream (replay)
            replay_obj_msg.stream_resource_id = self.stream_id

            datastream = None
            sha1 = None

            for key, identifiable in replay_obj_msg.identifiables.iteritems():
                if isinstance(identifiable, DataStream):
                    datastream = identifiable
                elif isinstance(identifiable, Encoding):
                    sha1 = identifiable.sha1

            if sha1:  # if there is an encoding
                # Get the file from disk
                filename = FileSystem.get_url(FS.CACHE, sha1, ".hdf5")
                log.warn('Replay reading from filename: %s' % filename)
                hdf_string = ''
                try:
                    with open(filename, mode='rb') as f:
                        hdf_string = f.read()

                    # Check the Sha1
                    retrieved_hdfstring_sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
                    if sha1 != retrieved_hdfstring_sha1:
                        raise ReplayProcessException('The sha1 in the datastream does not match the sha1 of the hdf_string read from the saved file in hdf storage')
                except IOError:
                    log.warn('No HDF file found!')
                    #@todo deal with this situation? How?
                    hdf_string = 'HDF File %s not found!' % filename

                # set the datastream.values field!
                datastream.values = hdf_string
            else:
                log.warn('No encoding in the StreamGranuleContainer!')

            self.lock.acquire()
            self.output.publish(replay_obj_msg)
            self.lock.release()

        else:
            log.warn('Unknown type retrieved in DOC!')

    #@todo: log when there are no results
    if results is None:
        log.warn('No results found in replay query!')
    else:
        log.debug('Published replay!')
def start(self):
    if self.container.has_capability(self.container.CCAP.FILE_SYSTEM):
        self.datastore_dir = FileSystem.get_url(FS.FILESTORE, self.datastore_name)
    else:
        self.datastore_dir = "./tmp/%s" % self.datastore_name
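# The fallback branch above only builds a path string; it relies on ./tmp/<datastore_name>
# being created elsewhere. A defensive standalone sketch of the same choice; the function
# name, flag, and root path here are placeholders, not the container's real capability API:
import os

def resolve_datastore_dir(has_filestore, filestore_root, datastore_name):
    # Prefer the container-managed file store when the capability is available,
    # otherwise fall back to a local ./tmp directory and make sure it exists.
    if has_filestore:
        return os.path.join(filestore_root, datastore_name)
    path = os.path.join('.', 'tmp', datastore_name)
    if not os.path.isdir(path):
        os.makedirs(path)
    return path

print(resolve_datastore_dir(False, '/tmp/ion/filestore', 'my_datastore'))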
def upload_data(dataproduct_id):
    upload_folder = FileSystem.get_url(FS.TEMP, 'uploads')
    try:
        rr_client = ResourceRegistryServiceProcessClient(node=Container.instance.node, process=service_gateway_instance)
        object_store = Container.instance.object_store

        try:
            rr_client.read(str(dataproduct_id))
        except BadRequest:
            raise BadRequest('Unknown DataProduct ID %s' % dataproduct_id)

        # required fields
        upload = request.files['file']  # <input type=file name="file">

        # determine filetype
        filetype = _check_magic(upload)
        upload.seek(0)  # return to beginning for save

        if upload and filetype is not None:
            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)

            # register upload
            file_upload_context = {
                # TODO add dataproduct_id
                'name': 'User uploaded file %s' % filename,
                'filename': filename,
                'filetype': filetype,
                'path': path,
                'upload_time': upload_time,
                'status': 'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_data_processor',
                executable={
                    'module': 'ion.processes.data.upload.upload_data_processing',
                    'class': 'UploadDataProcessing'
                }
            )
            process_definition_id = pd_client.create_process_definition(process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            # schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            config.process.dp_id = dataproduct_id
            pid = pd_client.schedule_process(process_definition_id, process_id=process_id, configuration=config)
            log.info('UploadDataProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')
    except Exception as e:
        return build_error_response(e)
def process_stream(self, packet, dset_config):
    """
    Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
    stream, e.g. stores it in hdf_storage or couch_storage.
    @param: packet The incoming data stream of type stream.
    @param: dset_config The dset_config telling this method what to do with the incoming data stream.
    """
    # Ignoring is_replay attribute now that we have a policy implementation
    if isinstance(packet, StreamGranuleContainer):
        if dset_config is None:
            log.warn('No dataset config for this stream!')
            return

        hdfstring = ''
        sha1 = ''

        for key, value in packet.identifiables.iteritems():
            if isinstance(value, DataStream):
                hdfstring = value.values
                value.values = ''
            elif isinstance(value, Encoding):
                sha1 = value.sha1

        if dset_config.archive_metadata is True:
            log.debug("Persisting data....")
            self.persist_immutable(packet)

        if dset_config.archive_data is True:
            #@todo - grab the filepath to save the hdf string somewhere..
            if hdfstring:
                calculated_sha1 = hashlib.sha1(hdfstring).hexdigest().upper()
                filename = FileSystem.get_url(FS.CACHE, calculated_sha1, ".hdf5")

                if sha1 != calculated_sha1:
                    raise IngestionWorkerException('The stored sha1 differs from the one calculated from the received hdf_string')

                log.warn('writing to filename: %s' % filename)

                with open(filename, mode='wb') as f:
                    f.write(hdfstring)
            else:
                log.warn("Nothing to write!")

    elif isinstance(packet, BlogPost) and not packet.is_replay:
        self.persist_immutable(packet)

    elif isinstance(packet, BlogComment) and not packet.is_replay:
        self.persist_immutable(packet)
def _get_coverage(cls, dataset_id, mode='w'):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    coverage = SimplexCoverage(file_root, dataset_id, mode=mode)
    return coverage
def _get_coverage(cls, dataset_id, mode='r'):
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    coverage = AbstractCoverage.load(file_root, dataset_id, mode=mode)
    return coverage
def test_dm_integration(self):
    '''
    test_dm_integration
    Test full DM Services Integration
    '''
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here
    #-----------------------------
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    process_list = []
    datasets = []

    datastore_name = 'test_dm_integration'

    #---------------------------
    # Set up ingestion
    #---------------------------
    # Configure ingestion using eight workers, ingesting to the test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
        number_of_workers=8
    )
    #
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    ctd_stream_def = ctd_stream_definition()

    stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Junk definition')

    #---------------------------
    # Set up the producers (CTD Simulators)
    #---------------------------
    # Launch five simulated CTD producers
    for iteration in xrange(5):
        # Make a stream to output on
        stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

        #---------------------------
        # Set up the datasets
        #---------------------------
        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )
        # Keep track of the datasets
        datasets.append(dataset_id)

        stream_policy_id = ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=ingestion_configuration_id
        )

        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module': 'ion.processes.data.ctd_stream_publisher',
            'class': 'SimpleCtdPublisher'
        }

        configuration = {
            'process': {
                'stream_id': stream_id,
                'datastore_name': datastore_name
            }
        }
        procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
        log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
        pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)

        # Keep track, we'll kill 'em later.
        process_list.append(pid)

    # Get about 4 seconds of data
    time.sleep(4)

    #---------------------------
    # Stop producing data
    #---------------------------
    for process in process_list:
        process_dispatcher.cancel_process(process)

    #----------------------------------------------
    # The replay and the transform, a love story.
    #----------------------------------------------
    # Happy Valentines to the clever coder who catches the above!

    transform_definition = ProcessDefinition()
    transform_definition.executable = {
        'module': 'ion.processes.data.transforms.transform_example',
        'class': 'TransformCapture'
    }
    transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

    dataset_id = datasets.pop()  # Just need one for now
    replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

    #--------------------------------------------
    # I'm Selling magazine subscriptions here!
    #--------------------------------------------
    subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
                                                                 exchange_name='transform_capture_point')

    #--------------------------------------------
    # Start the transform (capture)
    #--------------------------------------------
    transform_id = transform_management_service.create_transform(
        name='capture_transform',
        in_subscription_id=subscription,
        process_definition_id=transform_definition_id
    )

    transform_management_service.activate_transform(transform_id=transform_id)

    #--------------------------------------------
    # BEGIN REPLAY!
    #--------------------------------------------
    data_retriever_service.start_replay(replay_id=replay_id)

    #--------------------------------------------
    # Let's get some boundaries
    #--------------------------------------------
    bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
    assertions('latitude_bounds' in bounds, 'dataset_id: %s' % dataset_id)
    assertions('longitude_bounds' in bounds)
    assertions('pressure_bounds' in bounds)

    #--------------------------------------------
    # Make sure the transform capture worked
    #--------------------------------------------
    time.sleep(3)  # Give the other processes up to 3 seconds to catch up
    stats = os.stat(FileSystem.get_url(FS.TEMP, 'transform_output'))
    assertions(stats.st_size > 0)  # st_size (not st_blksize, which is always positive) verifies the capture file has content

    # BEAUTIFUL!

    FileSystem.unlink(FileSystem.get_url(FS.TEMP, 'transform_output'))