def generate_release(self):
    '''
    Get a list of data set to be processed and try to harmonised them
    into one big data cube
    '''
    release_graph = self._conf.get_graph_name('release')
    release_path = self._conf.get_path('release')

    # Assemble one work item per sheet to process
    tasks = []
    for sheet_name in self._get_sheets_list():
        tasks.append({
            'sheet_name': sheet_name,
            'output_file': release_path + sheet_name + '.ttl',
            'endpoint': self._conf.get_SPARQL(),
            'compress': self._conf.isCompress(),
            'target': self._conf.get_namespace('data'),
            'release_graph': release_graph,
            'raw_data_graph': self._conf.get_graph_name('raw-data'),
            'rules_graph': self._conf.get_graph_name('rules'),
            'measure': self._conf.get_measure(),
        })

    # Fan the work out over a small pool, capped at 4 workers to
    # avoid hammering the store too much
    worker_count = min(4, multiprocessing.cpu_count())
    pool = multiprocessing.Pool(processes=worker_count)
    pool.map(generate_release_thread, tasks)
    pool.close()
    pool.join()

    # Push all the generated data to the triple store
    self._push_to_graph(release_graph, release_path)

    # Configure a CubeMaker instance for the DSD generation
    cube_maker = CubeMaker(self._conf.get_SPARQL(),
                           release_graph,
                           self._conf.get_graph_name('raw-data'),
                           self._conf.get_graph_name('rules'))
    cube_maker.set_target_namespace(self._conf.get_namespace('data'))
    cube_maker.set_compress(self._conf.isCompress())

    # Generate the DSD file
    dsd_file_name = release_path + 'dsd.ttl'
    log.info("Asking CubeMaker to generate the DSD")
    cube_maker.generate_dsd(self._conf.get_cube_title(),
                            self._conf.get_measure(),
                            self._conf.get_measureunit(),
                            self._conf.get_slices(),
                            dsd_file_name)

    # Upload the DSD into the release graph; point at the compressed
    # file when compression is enabled
    pusher = Pusher(self._conf.get_SPARUL(),
                    self._conf.get_user(),
                    self._conf.get_secret())
    log.info("[{}] Adding the content of the DSD".format(release_graph))
    if self._conf.isCompress():
        dsd_file_name = dsd_file_name + ".bz2"
    pusher.upload_file(release_graph, dsd_file_name)
def generate_release(self):
    '''
    Get a list of data set to be processed and try to harmonised them
    into one big data cube
    '''
    # Build the per-sheet task descriptions up front
    tasks = [{'sheet_name': name,
              'output_file': self._conf.get_path('release') + name + '.ttl',
              'endpoint': self._conf.get_SPARQL(),
              'compress': self._conf.isCompress(),
              'target': self._conf.get_namespace('data'),
              'release_graph': self._conf.get_graph_name('release'),
              'raw_data_graph': self._conf.get_graph_name('raw-data'),
              'rules_graph': self._conf.get_graph_name('rules'),
              'measure': self._conf.get_measure()}
             for name in self._get_sheets_list()]

    # Process the tasks in parallel, never with more than four
    # workers so the store is not hammered too much
    pool = multiprocessing.Pool(processes=min(4, multiprocessing.cpu_count()))
    pool.map(generate_release_thread, tasks)
    pool.close()
    pool.join()

    # Upload everything that was produced to the triple store
    self._push_to_graph(self._conf.get_graph_name('release'),
                        self._conf.get_path('release'))

    # Set up the CubeMaker that will produce the DSD
    cube_maker = CubeMaker(self._conf.get_SPARQL(),
                           self._conf.get_graph_name('release'),
                           self._conf.get_graph_name('raw-data'),
                           self._conf.get_graph_name('rules'))
    cube_maker.set_target_namespace(self._conf.get_namespace('data'))
    cube_maker.set_compress(self._conf.isCompress())

    # Write the DSD out next to the rest of the release
    dsd_file_name = self._conf.get_path('release') + 'dsd.ttl'
    log.info("Asking CubeMaker to generate the DSD")
    cube_maker.generate_dsd(self._conf.get_cube_title(),
                            self._conf.get_measure(),
                            self._conf.get_measureunit(),
                            self._conf.get_slices(),
                            dsd_file_name)

    # Finally, load the DSD into the release graph. When compression
    # is on the generated file carries a .bz2 suffix.
    pusher = Pusher(self._conf.get_SPARUL(),
                    self._conf.get_user(),
                    self._conf.get_secret())
    log.info("[{}] Adding the content of the DSD".format(
        self._conf.get_graph_name('release')))
    if self._conf.isCompress():
        dsd_file_name = dsd_file_name + ".bz2"
    pusher.upload_file(self._conf.get_graph_name('release'), dsd_file_name)
def _push_to_graph(self, named_graph, directory):
    '''
    Push data to to the triple store
    '''
    pusher = Pusher(self._conf.get_SPARUL(),
                    self._conf.get_user(),
                    self._conf.get_secret())

    # Start from an empty graph
    log.info("[{}] Cleaning the content of the graph ".format(named_graph))
    pusher.clean_graph(named_graph)

    # Upload every file found in the directory, in a stable order
    log.info("[{}] Loading files in {}".format(named_graph, directory))
    for input_file in sorted(glob.glob(directory + '/*')):
        log.info("[{}] Loading {}".format(named_graph, input_file))
        pusher.upload_file(named_graph, input_file)
    log.info("[{}] Done loading data".format(named_graph))