def register_directory(self, dirpath, **kwargs):
    """
    Register every file found under a directory path.

    args:
        dirpath: directory that is handed to ``list_files`` as
            ``file_directory``
    kwargs:
        file_extensions: extensions to look for; when the caller does not
            supply the key, ``self.rdf_formats`` is used
        All kwargs (including the resolved ``file_extensions``) are
        forwarded to both ``list_files`` and ``self.register_rml``.
    """
    # Only fills the key when the caller left it out; an explicit value,
    # even None, is kept as passed.
    kwargs.setdefault('file_extensions', self.rdf_formats)
    for entry in list_files(file_directory=dirpath, **kwargs):
        # The last element of each listing entry is what gets registered.
        self.register_rml(entry[-1], **kwargs)
def load_directory(self, directory, **kwargs):
    """
    Loads all rdf files in a directory.

    args:
        directory: full path to the directory
    kwargs:
        log_level: logging level applied to the module logger while the
            load runs; defaults to ``self.log_level``
        file_extensions: extensions to include; defaults to the
            ``rdf_formats`` of the connection returned by
            ``self.__get_conn__``
        include_subfolders: whether to descend into subfolders
            (default False)
        All kwargs are also forwarded to ``self.__get_conn__`` and to
        ``self.load_file``.
    """
    log.setLevel(kwargs.get("log_level", self.log_level))
    try:
        conn = self.__get_conn__(**kwargs)
        file_extensions = kwargs.get('file_extensions', conn.rdf_formats)
        file_list = list_files(directory,
                               file_extensions,
                               kwargs.get('include_subfolders', False),
                               include_root=True)
        for file_info in file_list:
            # Element 1 of each listing entry is the value load_file expects.
            self.load_file(file_info[1], **kwargs)
    finally:
        # Restore the logger even when a load fails; otherwise the
        # temporary level would leak into every later call.
        log.setLevel(self.log_level)
def bulk_load(self, **kwargs):
    """
    Uploads data to the Blazegraph Triplestore that is stored in files
    that are in a local directory, by posting a request to the
    ``dataloader`` endpoint under ``self.url``.

    After the request, files found with a 'fail' extension are renamed
    back to their original name and retried one by one through
    ``self.load_local_file``. Finally every file with a 'good' or 'fail'
    extension has that extension stripped again.
    NOTE(review): presumably the bulk loader itself appends the
    '.good' / '.fail' suffixes -- confirm against BULK_LOADER_PARAMS.

    kwargs:
        file_directory: a string path to the file directory
            (default ``self.local_directory``)
        file_extensions: a list of file extensions to filter,
            example ['xml', 'rdf'] (default ``self.rdf_formats``)
        include_subfolders: as implied (default True)
        include_root: passed through to ``list_files`` (default False)
        root_dir: directory the listed paths are relative to
            (default ``self.local_directory``)
        namespace: the Blazegraph namespace to load the data
        graph: uri of the graph to load the data
            (default ``self.graph``)
        reset: when True ``self.reset_namespace()`` is called first

    NOTE: a 'create_namespace' kwarg is not consulted by this method.
    """
    namespace = kwargs.get('namespace', self.namespace)
    graph = kwargs.get('graph', self.graph)
    if kwargs.get('reset') == True:
        self.reset_namespace()
    file_directory = kwargs.get('file_directory', self.local_directory)
    file_extensions = kwargs.get('file_extensions', self.rdf_formats)
    root_dir = kwargs.get('root_dir', self.local_directory)
    include_subfolders = kwargs.get('include_subfolders', True)
    include_root = kwargs.get('include_root', False)

    def _find(extensions):
        # Every listing in this method differs only in the extensions.
        return list_files(file_directory,
                          extensions,
                          include_subfolders,
                          include_root=include_root,
                          root_dir=root_dir)

    def _strip_suffix(rel_path):
        # Rename '<name>.<suffix>' back to '<name>' below root_dir and
        # return the restored relative path.
        restored = os.path.splitext(rel_path)[0]
        os.rename(os.path.join(root_dir, rel_path),
                  os.path.join(root_dir, restored))
        return restored

    file_list = _find(file_extensions)

    path_parts = [' ']
    if self.container_dir:
        path_parts.append(self.container_dir)
    container_root = os.path.join(*path_parts)
    file_or_dirs = ",".join(
        os.path.join(container_root, file_info[1])
        for file_info in file_list).strip()
    # NOTE(review): the per-file list built above is immediately replaced
    # by this hard-coded path. Kept as-is to preserve current behavior;
    # confirm which of the two values is actually intended.
    file_or_dirs = "/alliance_data"

    _params = BULK_LOADER_PARAMS.copy()
    _params.update({'namespace': namespace,
                    'file_or_dirs': file_or_dirs})
    log.info(" starting load of '%s' files into namespace '%s'",
             len(file_list),
             namespace)
    # String values go into the template verbatim, everything else as JSON.
    new_params = {key: value if isinstance(value, str)
                  else json.dumps(value)
                  for key, value in _params.items()}
    data = BULK_LOADER_XML.format(**new_params)
    # Previously print() + pdb.set_trace(): the breakpoint blocked every
    # call, so the payload is now only logged at debug level.
    log.debug(" bulk_load request body:\n%s", data)
    url = os.path.join(self.url, 'dataloader')
    result = requests.post(url=url,
                           headers={"Content-Type": 'application/xml'},
                           data=data)

    # Keep only failures whose original extension (the part before
    # '.fail') is one of the requested extensions.
    failed_list = [file_info for file_info in _find(['fail'])
                   if file_info[0].split(".")[-2] in file_extensions]
    good_list = _find(['good'])
    log.info(" bulk_load results: %s\nThe following files successfully loaded: \n\t%s",
             result.text,
             "\n\t".join([os.path.splitext(file_info[1])[0]
                          for file_info in good_list]))
    if failed_list:
        log.warning("The following files failed to load:\n\t%s",
                    "\n\t".join([file_info[1]
                                 for file_info in failed_list]))
        log.info(" Attempting load via alt method ***")
        for file_info in failed_list:
            self.load_local_file(_strip_suffix(file_info[1]),
                                 namespace,
                                 graph)
    # restore file names
    for file_info in _find(['good', 'fail']):
        _strip_suffix(file_info[1])
def load_directory(self, method='data_stream', **kwargs):
    """
    Uploads data to the Blazegraph Triplestore that is stored in files
    that are in a local directory.

    args:
        method['local', 'data_stream']:
            'data_stream': each file is passed to ``self.load_data`` with
                ``is_file=True``; ``include_root`` is forced to True
            any other value ('local'): each file is passed to
                ``self.load_local_file``
    kwargs:
        file_directory: a string path to the file directory to start the
            search (default ``self.local_directory``)
        root_dir: root directory to be removed from the file paths
            (default ``self.local_directory``)
            for example:
                file_directory: this is as seen from python app
                    /example/python/data/dir/to/search
                container dir: the path as seen from the triplestore
                    /data
                root_dir: the portion of the path to remove so both
                    directories match
                    /example/python/data
        include_root: passed to ``list_files`` (default False, ignored
            for 'data_stream')
        file_extensions: a list of file extensions to filter,
            example ['xml', 'rdf'] (default ``self.rdf_formats``)
        include_subfolders: as implied (default True)
        namespace: the Blazegraph namespace to load the data
            (default ``self.namespace``)
        graph: uri of the graph to load the data. Default is None
        create_namespace: False(default) or True will create the
            namespace if it does not exist
        reset: when True ``self.reset_namespace()`` is called first
        use_threading(bool): when True every file is loaded in its own
            thread and the call waits for all of them

    NOTE: a 'container_dir' kwarg is not used by this method.

    raises:
        ValueError: the namespace does not exist (after the optional
            auto-create step)
    """
    if kwargs.get('reset') == True:
        self.reset_namespace()
    namespace = kwargs.get('namespace', self.namespace)
    graph = kwargs.get('graph')
    use_threading = kwargs.get('use_threading') == True
    time_start = datetime.datetime.now()
    include_root = kwargs.get('include_root', False)
    if method == 'data_stream':
        include_root = True
    file_directory = kwargs.get('file_directory', self.local_directory)
    file_extensions = kwargs.get('file_extensions', self.rdf_formats)
    file_list = list_files(file_directory,
                           file_extensions,
                           kwargs.get('include_subfolders', True),
                           include_root=include_root,
                           root_dir=kwargs.get('root_dir',
                                               self.local_directory))
    # Report the namespace actually targeted, not the instance default.
    log.info(" starting load of '%s' files into namespace '%s'",
             len(file_list),
             namespace)
    if kwargs.get('create_namespace') and namespace:
        if not self.has_namespace(namespace):
            self.create_namespace(namespace)
    if not self.has_namespace(namespace):
        msg = ("Namespace '%s' does not exist. "
               "Pass kwarg 'create_namespace=True' to "
               "auto-create the namespace.") % namespace
        raise ValueError(msg)

    workers = []
    for file_info in file_list:
        if use_threading:
            if method == 'data_stream':
                worker = threading.Thread(name=file_info[1],
                                          target=self.load_data,
                                          args=(file_info[1],
                                                None,
                                                namespace,
                                                graph,
                                                True,))
            else:
                worker = threading.Thread(name=file_info[1],
                                          target=self.load_local_file,
                                          args=(file_info[1],
                                                namespace,
                                                graph,))
            worker.start()
            workers.append(worker)
        elif method == 'data_stream':
            self.load_data(data=file_info[1],
                           namespace=namespace,
                           graph=graph,
                           is_file=True)
        else:
            self.load_local_file(file_info[1], namespace, graph)
    # Wait only for the threads started above. Joining everything from
    # threading.enumerate() would block on unrelated threads and raise
    # RuntimeError when this method runs outside the main thread.
    for worker in workers:
        worker.join()
    log.info("%s file(s) loaded in: %s",
             len(file_list),
             datetime.datetime.now() - time_start)
def bulk_load(self, **kwargs):
    """
    Uploads data to the Blazegraph Triplestore that is stored in files
    that are in a local directory, by posting a request to the
    ``dataloader`` endpoint under ``self.url``.

    After the request, files found with a 'fail' extension are renamed
    back to their original name and retried one by one through
    ``self.load_local_file``. Finally every file with a 'good' or 'fail'
    extension has that extension stripped again.
    NOTE(review): presumably the bulk loader itself appends the
    '.good' / '.fail' suffixes -- confirm against BULK_LOADER_PARAMS.

    kwargs:
        file_directory: a string path to the file directory
            (default ``self.local_directory``)
        file_extensions: a list of file extensions to filter,
            example ['xml', 'rdf'] (default ``self.rdf_formats``)
        include_subfolders: as implied (default True)
        include_root: passed through to ``list_files`` (default False)
        root_dir: directory the listed paths are relative to
            (default ``self.local_directory``)
        namespace: the Blazegraph namespace to load the data
        graph: uri of the graph to load the data
            (default ``self.graph``)
        reset: when True ``self.reset_namespace()`` is called first

    NOTE: a 'create_namespace' kwarg is not consulted by this method.
    """
    namespace = kwargs.get('namespace', self.namespace)
    graph = kwargs.get('graph', self.graph)
    if kwargs.get('reset') == True:
        self.reset_namespace()
    file_directory = kwargs.get('file_directory', self.local_directory)
    file_extensions = kwargs.get('file_extensions', self.rdf_formats)
    root_dir = kwargs.get('root_dir', self.local_directory)
    include_subfolders = kwargs.get('include_subfolders', True)
    include_root = kwargs.get('include_root', False)

    def _find(extensions):
        # Every listing in this method differs only in the extensions.
        return list_files(file_directory,
                          extensions,
                          include_subfolders,
                          include_root=include_root,
                          root_dir=root_dir)

    def _strip_suffix(rel_path):
        # Rename '<name>.<suffix>' back to '<name>' below root_dir and
        # return the restored relative path.
        restored = os.path.splitext(rel_path)[0]
        os.rename(os.path.join(root_dir, rel_path),
                  os.path.join(root_dir, restored))
        return restored

    file_list = _find(file_extensions)

    path_parts = [' ']
    if self.container_dir:
        path_parts.append(self.container_dir)
    container_root = os.path.join(*path_parts)
    file_or_dirs = ",".join(
        os.path.join(container_root, file_info[1])
        for file_info in file_list).strip()
    # NOTE(review): the per-file list built above is immediately replaced
    # by this hard-coded path. Kept as-is to preserve current behavior;
    # confirm which of the two values is actually intended.
    file_or_dirs = "/alliance_data"

    _params = BULK_LOADER_PARAMS.copy()
    _params.update({'namespace': namespace,
                    'file_or_dirs': file_or_dirs})
    log.info(" starting load of '%s' files into namespace '%s'",
             len(file_list),
             namespace)
    # String values go into the template verbatim, everything else as JSON.
    new_params = {key: value if isinstance(value, str)
                  else json.dumps(value)
                  for key, value in _params.items()}
    data = BULK_LOADER_XML.format(**new_params)
    # Previously print() + pdb.set_trace(): the breakpoint blocked every
    # call, so the payload is now only logged at debug level.
    log.debug(" bulk_load request body:\n%s", data)
    url = os.path.join(self.url, 'dataloader')
    result = requests.post(url=url,
                           headers={"Content-Type": 'application/xml'},
                           data=data)

    # Keep only failures whose original extension (the part before
    # '.fail') is one of the requested extensions.
    failed_list = [file_info for file_info in _find(['fail'])
                   if file_info[0].split(".")[-2] in file_extensions]
    good_list = _find(['good'])
    log.info(" bulk_load results: %s\nThe following files successfully loaded: \n\t%s",
             result.text,
             "\n\t".join([os.path.splitext(file_info[1])[0]
                          for file_info in good_list]))
    if failed_list:
        log.warning("The following files failed to load:\n\t%s",
                    "\n\t".join([file_info[1]
                                 for file_info in failed_list]))
        log.info(" Attempting load via alt method ***")
        for file_info in failed_list:
            self.load_local_file(_strip_suffix(file_info[1]),
                                 namespace,
                                 graph)
    # restore file names
    for file_info in _find(['good', 'fail']):
        _strip_suffix(file_info[1])
def load_directory(self, method='data_stream', **kwargs):
    """
    Uploads data to the Blazegraph Triplestore that is stored in files
    that are in a local directory.

    args:
        method['local', 'data_stream']:
            'data_stream': each file is passed to ``self.load_data`` with
                ``is_file=True``; ``include_root`` is forced to True
            any other value ('local'): each file is passed to
                ``self.load_local_file``
    kwargs:
        file_directory: a string path to the file directory to start the
            search (default ``self.local_directory``)
        root_dir: root directory to be removed from the file paths
            (default ``self.local_directory``)
            for example:
                file_directory: this is as seen from python app
                    /example/python/data/dir/to/search
                container dir: the path as seen from the triplestore
                    /data
                root_dir: the portion of the path to remove so both
                    directories match
                    /example/python/data
        include_root: passed to ``list_files`` (default False, ignored
            for 'data_stream')
        file_extensions: a list of file extensions to filter,
            example ['xml', 'rdf'] (default ``self.rdf_formats``)
        include_subfolders: as implied (default True)
        namespace: the Blazegraph namespace to load the data
            (default ``self.namespace``)
        graph: uri of the graph to load the data. Default is None
        create_namespace: False(default) or True will create the
            namespace if it does not exist
        reset: when True ``self.reset_namespace()`` is called first
        use_threading(bool): when True every file is loaded in its own
            thread and the call waits for all of them

    NOTE: a 'container_dir' kwarg is not used by this method.

    raises:
        ValueError: the namespace does not exist (after the optional
            auto-create step)
    """
    if kwargs.get('reset') == True:
        self.reset_namespace()
    namespace = kwargs.get('namespace', self.namespace)
    graph = kwargs.get('graph')
    use_threading = kwargs.get('use_threading') == True
    time_start = datetime.datetime.now()
    include_root = kwargs.get('include_root', False)
    if method == 'data_stream':
        include_root = True
    file_directory = kwargs.get('file_directory', self.local_directory)
    file_extensions = kwargs.get('file_extensions', self.rdf_formats)
    file_list = list_files(file_directory,
                           file_extensions,
                           kwargs.get('include_subfolders', True),
                           include_root=include_root,
                           root_dir=kwargs.get('root_dir',
                                               self.local_directory))
    # Report the namespace actually targeted, not the instance default.
    log.info(" starting load of '%s' files into namespace '%s'",
             len(file_list),
             namespace)
    if kwargs.get('create_namespace') and namespace:
        if not self.has_namespace(namespace):
            self.create_namespace(namespace)
    if not self.has_namespace(namespace):
        msg = ("Namespace '%s' does not exist. "
               "Pass kwarg 'create_namespace=True' to "
               "auto-create the namespace.") % namespace
        raise ValueError(msg)

    workers = []
    for file_info in file_list:
        if use_threading:
            if method == 'data_stream':
                worker = threading.Thread(name=file_info[1],
                                          target=self.load_data,
                                          args=(file_info[1],
                                                None,
                                                namespace,
                                                graph,
                                                True,))
            else:
                worker = threading.Thread(name=file_info[1],
                                          target=self.load_local_file,
                                          args=(file_info[1],
                                                namespace,
                                                graph,))
            worker.start()
            workers.append(worker)
        elif method == 'data_stream':
            self.load_data(data=file_info[1],
                           namespace=namespace,
                           graph=graph,
                           is_file=True)
        else:
            self.load_local_file(file_info[1], namespace, graph)
    # Wait only for the threads started above. Joining everything from
    # threading.enumerate() would block on unrelated threads and raise
    # RuntimeError when this method runs outside the main thread.
    for worker in workers:
        worker.join()
    log.info("%s file(s) loaded in: %s",
             len(file_list),
             datetime.datetime.now() - time_start)