def __init__(self, url=None, local_directory=None, container_dir=None, **kwargs):
    """Set up the Elasticsearch connection definition.

    args:
        url: elasticsearch url; falls back to the configured default
        local_directory: local data directory
        container_dir: data directory as seen by the search container
    kwargs:
        local_url: alternate url for local access
        active(True): whether this connection is active
        delay_check: if truthy, skip the initial status check
        es_url: set internally and passed to the superclass

    raises:
        AttributeError: when no url could be determined
    """
    self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH)
    self.ext_url = pick(url, self.default_url)
    self.local_url = pick(kwargs.get('local_url'), self.default_url)
    self.url = None
    self.active = kwargs.get('active', True)
    # bug fix: validate the url BEFORE check_status and the superclass
    # try to use it (matches the Blazegraph __init__ ordering)
    if self.ext_url is None:
        msg = ["A Elasticsearch url must be defined. Either pass 'url'",
               "or initialize the 'RdfConfigManager'"]
        raise AttributeError(" ".join(msg))
    if not kwargs.get('delay_check'):
        # check_status is a property with connection side effects
        self.check_status
    if self.url:
        kwargs['es_url'] = self.url
    else:
        kwargs['es_url'] = self.ext_url
    super(Elastic, self).__init__(**kwargs)
    self.container_dir = container_dir
def __init__(self, url=None, namespace=None, namespace_params=None, local_directory=None, container_dir=None, graph=None, **kwargs):
    """Set up the Blazegraph connection definition.

    args:
        url: blazegraph url; falls back to the configured default
        namespace: blazegraph namespace to operate against
        namespace_params: dict of namespace creation parameters
        local_directory: local data directory
        container_dir: data directory as seen by the blazegraph server
        graph: default graph uri
    kwargs:
        local_url: alternate url for local access
        log_level: log level to apply during initialization
        active(True): whether this connection is active
        delay_check: if truthy, skip the initial status check

    raises:
        AttributeError: when no url could be determined
    """
    self.local_directory = pick(local_directory, CFG.dirs.data)
    self.ext_url = pick(url, self.default_url)
    self.local_url = pick(kwargs.get('local_url'), self.default_url)
    # remember the module log level so later calls can restore it
    self.log_level = log.level
    log.setLevel(kwargs.get("log_level", log.level))
    self.namespace = pick(namespace, self.default_ns)
    self.namespace_params = namespace_params
    self.container_dir = container_dir
    self.graph = pick(graph, self.default_graph)
    self.url = None
    self.active = kwargs.get('active', True)
    if self.ext_url is None:
        raise AttributeError("A Blazegraph url must be defined. Either pass "
                             "'url' or initialize the 'RdfConfigManager'")
    if not kwargs.get('delay_check'):
        # check_status is a property with connection side effects
        self.check_status
    self.__set_mgr__(**kwargs)
def __init__(self, url=None, local_directory=None, container_dir=None, **kwargs):
    """Set up the Elasticsearch connection definition.

    args:
        url: elasticsearch url; falls back to the configured default
        local_directory: local data directory
        container_dir: data directory as seen by the search container
    kwargs:
        local_url: alternate url for local access
        active(True): whether this connection is active
        delay_check: if truthy, skip the initial status check
        es_url: set internally and passed to the superclass

    raises:
        AttributeError: when no url could be determined
    """
    self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH)
    self.ext_url = pick(url, self.default_url)
    self.local_url = pick(kwargs.get('local_url'), self.default_url)
    self.url = None
    self.active = kwargs.get('active', True)
    # bug fix: fail fast on a missing url instead of validating only
    # after check_status and super().__init__ have already used it
    if self.ext_url is None:
        msg = [
            "A Elasticsearch url must be defined. Either pass 'url'",
            "or initialize the 'RdfConfigManager'"
        ]
        raise AttributeError(" ".join(msg))
    if not kwargs.get('delay_check'):
        # check_status is a property with connection side effects
        self.check_status
    kwargs['es_url'] = self.url if self.url else self.ext_url
    super(Elastic, self).__init__(**kwargs)
    self.container_dir = container_dir
def create_namespace(self, namespace=None, params=None):
    """Create a namespace in the triplestore.

    args:
        namespace: the name of the namespace to create
        params: Dictionary of Blazegraph parameters. defaults are:
                {'axioms': 'com.bigdata.rdf.axioms.NoAxioms',
                 'geoSpatial': False,
                 'isolatableIndices': False,
                 'justify': False,
                 'quads': False,
                 'rdr': False,
                 'textIndex': False,
                 'truthMaintenance': False}

    raises:
        ReferenceError: no namespace supplied or configured
        RuntimeError: blazegraph did not return a 201 (created) status
    """
    namespace = pick(namespace, self.namespace)
    params = pick(params, self.namespace_params)
    if not namespace:
        raise ReferenceError("No 'namespace' specified")
    properties = {
        'axioms': 'com.bigdata.rdf.axioms.NoAxioms',
        'geoSpatial': False,
        'isolatableIndices': False,
        'justify': False,
        'namespace': namespace,
        'quads': True,
        'rdr': False,
        'textIndex': False,
        'truthMaintenance': False
    }
    if params:
        properties.update(params)
    # Step 1: have blazegraph render the property map as a java
    # properties document via the prepareProperties endpoint.
    prop_lines = []
    for map_key, map_val in self.ns_property_map.items():
        value = json.dumps(properties[map_key]).replace("\"", "")
        prop_lines.append("%s=%s" % (map_val, value))
    prepare_url = self._make_url("prepareProperties").replace("/sparql", "")
    prepared = requests.post(url=prepare_url,
                             headers={"Content-Type": "text/plain"},
                             data="\n".join(prop_lines))
    data = prepared.text
    # Step 2: post the prepared properties document to create the
    # namespace itself.
    create_url = self._make_url("x").replace("/x/sparql", "")
    result = requests.post(url=create_url,
                           headers={"Content-Type": "application/xml"},
                           data=data)
    if result.status_code != 201:
        raise RuntimeError(result.text)
    log.warning(result.text)
    return result.text
def reset_namespace(self, namespace=None, params=None):
    """Delete and recreate the specified namespace.

    args:
        namespace(str): Namespace to reset
        params(dict): params used to reset the namespace
    """
    target = pick(namespace, self.namespace)
    settings = pick(params, self.namespace_params)
    log.warning(" Reseting namespace '%s' at host: %s", target, self.url)
    try:
        self.delete_namespace(target)
    except KeyError:
        # namespace was not present; nothing to delete
        pass
    self.create_namespace(target, settings)
def load_data(self, data, datatype="ttl", namespace=None, graph=None, is_file=False, **kwargs):
    """Loads data via file stream from python to the rdflib triplestore.

    Args:
        data: The data or filepath to load
        datatype(['ttl', 'xml', 'rdf']): the type of data to load
        namespace: the namespace to use
        graph: the graph (publicID) to load the data to.
        is_file(False): If true python will read the data argument as a
            filepath, determine the datatype from the file extension,
            read the file and parse its contents
    kwargs:
        debug: if truthy, sets logging to DEBUG

    Raises:
        NotImplementedError: datatype has no registered rdflib format
    """
    if kwargs.get('debug'):
        log.setLevel(logging.DEBUG)
    datatype_map = {
        'ttl': 'turtle',
        'xml': 'xml',
        'rdf': 'xml',
        'nt': 'nt',
        'n3': 'n3',
        'nquads': 'nquads',
        'hturtle': 'hturtle'
    }
    file_name = None
    if is_file:
        datatype = data.split(os.path.extsep)[-1]
        file_name = data
        log.debug('starting data load of %s', file_name)
        # bug fix: close the file handle instead of leaking it
        with open(data, 'rb') as file_obj:
            data = file_obj.read()
    try:
        content_type = datatype_map[datatype]
    except KeyError:
        # bug fix: apply %-formatting (original passed the value as a
        # second exception argument) and correct the 'fromat' typo
        raise NotImplementedError(
            "'%s' is not an implemented data format" % datatype)
    conn = self.conn
    if namespace:
        conn = self.tstore.get_namespace(namespace)
    else:
        namespace = self.namespace
    graph = pick(graph, self.graph)
    start = datetime.datetime.now()
    try:
        result = conn.parse(data=data, publicID=graph, format=content_type)
    except Exception:
        # surface which file failed before re-raising
        if is_file:
            print("Datafile ", file_name)
        raise
    if is_file:
        log.info(" loaded %s into rdflib namespace '%s'",
                 file_name, namespace)
    else:
        log.info(" loaded data into rdflib namespace '%s' in time: %s",
                 namespace, (datetime.datetime.now() - start))
    return result
def load_local_file(self, file_path, namespace=None, graph=None, **kwargs):
    """Uploads data to the Blazegraph Triplestore that is stored in files
    in a directory that is available locally to blazegraph.

    args:
        file_path: full path to the file
        namespace: the Blazegraph namespace to load the data
        graph: uri of the graph to load the data. Default is None
    kwargs:
        container_dir: the directory as seen by blazegraph - defaults to
            the instance attribute if not passed

    raises:
        SyntaxError: blazegraph returned an error status (> 300)
    """
    time_start = datetime.datetime.now()
    url = self._make_url(namespace)
    params = {}
    if graph:
        params['context-uri'] = graph
    new_path = []
    container_dir = pick(kwargs.get('container_dir'), self.container_dir)
    if container_dir:
        # bug fix: use the picked value so a 'container_dir' kwarg
        # actually overrides the instance attribute
        new_path.append(container_dir)
    new_path.append(file_path)
    params['uri'] = "file:///%s" % os.path.join(*new_path)
    log.debug(" loading %s into blazegraph", file_path)
    result = requests.post(url=url, params=params)
    if result.status_code > 300:
        raise SyntaxError(result.text)
    log.info("loaded '%s' in time: %s blazegraph response: %s",
             file_path,
             datetime.datetime.now() - time_start,
             self.format_response(result.text))
    return result
def load_data(self, data, datatype="ttl", namespace=None, graph=None, is_file=False, **kwargs):
    """Loads data via file stream from python to the Blazegraph
    triplestore.

    Args:
    -----
        data: The data or filepath to load
        datatype(['ttl', 'xml', 'rdf', 'nt']): the type of data to load
        namespace: the namespace to use
        graph: the graph to load the data to.
        is_file(False): If true python will read the data argument as a
            filepath, determine the datatype from the file extension,
            read the file and send it to blazegraph as a datastream
    kwargs:
        log_level: log level to apply during the load

    Raises:
    -------
        NotImplementedError: datatype is not in the datatype map
        SyntaxError: blazegraph returned a non-200 response
    """
    log.setLevel(kwargs.get("log_level", self.log_level))
    datatype_map = {
        'ttl': 'text/turtle',
        'xml': 'application/rdf+xml',
        'rdf': 'application/rdf+xml',
        'nt': 'text/plain'
    }
    file_name = None
    if is_file:
        datatype = data.split(os.path.extsep)[-1]
        file_name = data
        log.debug('starting data load of %s', file_name)
        # bug fix: close the file handle instead of leaking it
        with open(data, 'rb') as file_obj:
            data = file_obj.read()
    else:
        try:
            data = data.encode('utf-8')
        except AttributeError:
            # data already encoded
            pass
    try:
        content_type = datatype_map[datatype]
    except KeyError:
        # bug fix: apply %-formatting (the original passed the value as
        # a second exception argument, never interpolating it)
        raise NotImplementedError(
            "'%s' is not an implemented data format" % datatype)
    context_uri = pick(graph, self.graph)
    result = requests.post(url=self._make_url(namespace),
                           headers={"Content-Type": content_type},
                           params={"context-uri": context_uri},
                           data=data)
    if result.status_code == 200:
        if is_file:
            log.info(" loaded %s into blazegraph - %s",
                     file_name,
                     self.format_response(result.text))
        else:
            log.info(" loaded data - %s", self.format_response(result.text))
        log.setLevel(self.log_level)
        return result
    else:
        raise SyntaxError(result.text)
def reset_namespace(self, namespace=None, params=None):
    """Delete and recreate the specified namespace.

    args:
        namespace(str): Namespace to reset
        params(dict): params used to reset the namespace
    """
    # per-call logger named after this method
    log = logging.getLogger("%s.%s" % (self.log_name, inspect.stack()[0][3]))
    log.setLevel(self.log_level)
    target = pick(namespace, self.namespace)
    settings = pick(params, self.namespace_params)
    log.warning(" Reseting namespace '%s' at host: %s", target, self.url)
    try:
        self.delete_namespace(target)
    except RuntimeError:
        # delete failed (e.g. namespace missing); proceed to create
        pass
    self.create_namespace(target, settings)
def __init__(self, url=None, namespace=None, namespace_params=None, local_directory=None, container_dir=None, graph=None, **kwargs):
    """Set up the rdflib-backed triplestore connection.

    args:
        url: unused for rdflib; kept for interface parity
        namespace: namespace to connect to (created if missing)
        namespace_params: dict of namespace parameters
        local_directory: local data directory
        container_dir: data directory as seen by the store container
        graph: default graph uri
    kwargs:
        active(True): whether this connection is active
    """
    self.url = "No Url for Rdflib tstore"
    self.active = kwargs.get('active', True)
    self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH, "")
    self.namespace = pick(namespace, self.default_ns)
    self.namespace_params = namespace_params
    self.graph = pick(graph, self.default_graph)
    self.container_dir = container_dir
    # EAFP: connect to the namespace, creating it on first use
    try:
        self.conn = self.tstore.get_namespace(self.namespace)
    except KeyError:
        self.tstore.create_namespace(self.namespace)
        self.conn = self.tstore.get_namespace(self.namespace)
    self.__set_mgr__(**kwargs)
def _make_url(self, namespace=None, url=None, **kwargs):
    """Creates the REST Url based on the supplied namespace.

    args:
        namespace: string of the namespace
        url: optional base url overriding the instance url
    kwargs:
        check_status_call: True/False, whether the function is called
            from check_status. Used to avoid recursion error
    """
    # probe the server first unless this call originates from the
    # status check itself
    if not kwargs.get("check_status_call") and not self.url:
        self.check_status
    base = url if url else self.url
    if base is None:
        base = self.ext_url
    namespace = pick(namespace, self.namespace)
    if namespace:
        base = os.path.join(base.replace("sparql", ""),
                            "namespace",
                            namespace,
                            "sparql").replace("\\", "/")
    elif not base.endswith("sparql"):
        base = os.path.join(base, "sparql").replace("\\", "/")
    return base
def __predicate_object_map__(self, map_iri):
    """Iterates through rr:predicateObjectMaps for this TripleMap
    creating a SimpleNamespace for each triple map and assigning the
    constant, template, parentTripleMap, reference as properties.

    Args:
    -----
        map_iri: rdflib.URIRef, TripleMap IRI

    Returns:
    --------
        list: List of predicate_object Namespace objects
    """
    pred_obj_maps = []
    for pred_obj_map_bnode in self.rml.objects(
            subject=map_iri,
            predicate=NS_MGR.rr.predicateObjectMap.rdflib):
        pred_obj_map = SimpleNamespace()
        pred_obj_map.predicate = self.rml.value(
            subject=pred_obj_map_bnode,
            predicate=NS_MGR.rr.predicate.rdflib)
        obj_map_bnode = self.rml.value(
            subject=pred_obj_map_bnode,
            predicate=NS_MGR.rr.objectMap.rdflib)
        # a predicateObjectMap without an objectMap carries no data
        if obj_map_bnode is None:
            continue
        # pull each optional R2RML/RML term from the objectMap; any
        # that are absent come back as None
        pred_obj_map.constant = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rr.constant.rdflib)
        pred_obj_map.template = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rr.template.rdflib)
        pred_obj_map.parentTriplesMap = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rr.parentTriplesMap.rdflib)
        if pred_obj_map.parentTriplesMap is not None:
            # track referenced parent maps for later resolution
            self.parents.add(str(pred_obj_map.parentTriplesMap))
        pred_obj_map.reference = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rr.reference.rdflib)
        pred_obj_map.datatype = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rr.datatype.rdflib)
        pred_obj_map.query = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rml.query.rdflib)
        pred_obj_map.json_query = self.rml.value(
            subject=obj_map_bnode,
            predicate=NS_MGR.rml.reference.rdflib)
        # the json_key falls back to the logicalSource's key when the
        # objectMap does not define one
        json_key = None
        if hasattr(self.triple_maps[str(map_iri)].logicalSource,
                   'json_key'):
            json_key = self.triple_maps[str(
                map_iri)].logicalSource.json_key
        pred_obj_map.json_key = pick(
            self.rml.value(subject=obj_map_bnode,
                           predicate=NS_MGR.rml.key.rdflib),
            json_key)
        # BIBCAT Extensions
        pred_obj_map.delimiters = []
        # any json_query anywhere flips the instance-wide flag
        if pred_obj_map.json_query:
            self.use_json_qry = True
        for obj in self.rml.objects(subject=obj_map_bnode,
                                    predicate=NS_MGR.kds.delimiter.rdflib):
            pred_obj_map.delimiters.append(obj)
        pred_obj_maps.append(pred_obj_map)
    return pred_obj_maps
def query(self, sparql, mode="get", namespace=None, rtn_format="json", **kwargs):
    """Runs a sparql query and returns the results.

    Args:
    -----
        sparql: the sparql query to run
        namespace: the namespace to run the sparql query against
        mode: ['get'(default), 'update'] the type of sparql query
        rtn_format: ['json'(default), 'xml'] format of query results

    Kwargs:
    -------
        debug(bool): If True sets logging level to debug
        log_level: explicit log level to apply

    Raises:
    -------
        KeyError: rtn_format is not a supported format
        NotImplementedError: mode is neither 'get' nor 'update'
        SyntaxError: the triplestore rejected the query
    """
    namespace = pick(namespace, self.namespace)
    if kwargs.get("log_level"):
        log.setLevel(kwargs['log_level'])
    if kwargs.get("debug"):
        log.setLevel(logging.DEBUG)
    if rtn_format not in self.qry_formats:
        # bug fix: report the keys of the same mapping used for the
        # membership test; the original referenced a nonexistent
        # 'qry_results_formats' attribute, masking the KeyError with
        # an AttributeError
        raise KeyError("rtn_format was '%s'. Allowed values are %s" % \
                (rtn_format, list(self.qry_formats)))
    url = self._make_url(namespace)
    # prepend the standard prefixes when the query has none
    if 'prefix' not in sparql.lower():
        sparql = "%s\n%s" % (NSM.prefix(), sparql)
    if mode == "get":
        data = {"query": sparql} #, "format": rtn_format}
    elif mode == "update":
        data = {"update": sparql}
    else:
        raise NotImplementedError("'mode' != to ['get', 'update']")
    headers = {'Accept': self.qry_formats[rtn_format]}
    start = datetime.datetime.now()
    try:
        result = requests.post(url, data=data, headers=headers)
    except requests.exceptions.ConnectionError:
        # external host unreachable; retry against the local url
        result = requests.post(self._make_url(namespace, self.local_url),
                               data=data,
                               headers=headers)
    log.debug(
        format_multiline([
            "",
            "url='{url}'",
            """mode='{mode}', namespace='{namespace}',
               rtn_format='{rtn_format}'""",
            "**** SPAQRL QUERY ****",
            "",
            "{sparql}",
            "Query Time: {q_time}"
        ],
        url=url,
        mode=mode,
        namespace=namespace,
        rtn_format=rtn_format,
        sparql=sparql,
        q_time=(datetime.datetime.now() - start),
        **kwargs))
    if result.status_code == 200:
        try:
            if rtn_format == "json":
                bindings = result.json().get('results', {}).get('bindings', [])
            elif rtn_format == 'xml':
                xml_doc = etree.XML(result.text)
                bindings = xml_doc.findall("results/bindings")
            else:
                bindings = result.text
            try:
                log.debug("result count: %s", len(bindings))
            except TypeError:
                # bindings may be a non-sized object; count is optional
                pass
            return bindings
        except json.decoder.JSONDecodeError:
            # update queries return an html/text status page, not json
            if mode == 'update':
                return BeautifulSoup(result.text, 'lxml').get_text()
            return result.text
    else:
        raise SyntaxError("%s\n\n%s\n\n%s" %
                          (sparql,
                           add_sparql_line_nums(sparql),
                           result.text[result.text.find("java."):]))
def load_data(self, data, datatype="ttl", namespace=None, graph=None, is_file=False, **kwargs):
    """Loads data via file stream from python to the rdflib triplestore.

    Args:
        data: The data or filepath to load
        datatype(['ttl', 'xml', 'rdf']): the type of data to load
        namespace: the namespace to use
        graph: the graph (publicID) to load the data to.
        is_file(False): If true python will read the data argument as a
            filepath, determine the datatype from the file extension,
            read the file and parse its contents
    kwargs:
        debug: if truthy, sets logging to DEBUG

    Raises:
        NotImplementedError: datatype has no registered rdflib format
    """
    if kwargs.get('debug'):
        log.setLevel(logging.DEBUG)
    datatype_map = {
        'ttl': 'turtle',
        'xml': 'xml',
        'rdf': 'xml',
        'nt': 'nt',
        'n3': 'n3',
        'nquads': 'nquads',
        'hturtle': 'hturtle'
    }
    file_name = None
    if is_file:
        datatype = data.split(os.path.extsep)[-1]
        file_name = data
        log.debug('starting data load of %s', file_name)
        # bug fix: close the file handle instead of leaking it
        with open(data, 'rb') as file_obj:
            data = file_obj.read()
    try:
        content_type = datatype_map[datatype]
    except KeyError:
        # bug fix: apply %-formatting (original passed the value as a
        # second exception argument) and correct the 'fromat' typo
        raise NotImplementedError(
            "'%s' is not an implemented data format" % datatype)
    conn = self.conn
    if namespace:
        conn = self.tstore.get_namespace(namespace)
    else:
        namespace = self.namespace
    graph = pick(graph, self.graph)
    start = datetime.datetime.now()
    try:
        result = conn.parse(data=data, publicID=graph, format=content_type)
    except Exception:
        # surface which file failed before re-raising
        if is_file:
            print("Datafile ", file_name)
        raise
    if is_file:
        log.info(" loaded %s into rdflib namespace '%s'",
                 file_name, namespace)
    else:
        log.info(" loaded data into rdflib namespace '%s' in time: %s",
                 namespace, (datetime.datetime.now() - start))
    return result