class Session:
    """Indexing session driven by a YAML configuration file.

    The configuration is read once in ``__init__`` and kept in ``self.conf``.
    The keys this class reads are ``sparql`` (with a ``uri`` entry),
    ``elasticsearch``, ``prefixes``, ``common_properties`` and ``indexes``
    (a mapping of index name -> doc type -> per-type configuration).
    """

    def __init__(self, args, loadOnly):
        """Load the configuration and set up the Elasticsearch/URL clients.

        :param args: path of the YAML configuration file to open.
        :param loadOnly: when true, ``run`` does not delete and re-create
            the indexes before loading documents into them.
        """
        with open(args) as f:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects and is rejected by recent PyYAML
            # releases; consider yaml.safe_load if the configuration never
            # relies on Python-specific tags.
            self.conf = yaml.load(f)
        self.loadOnly = loadOnly
        es_hosts = self.conf.get("elasticsearch")
        print("###")
        print("# SPARQL endpoint: " + self.conf["sparql"]["uri"])
        print("# ElasticSearch: %s" % es_hosts)
        print("###")
        self.es = Elasticsearch(es_hosts)
        self.urlOpener = FancyURLopener()
        # Prefer SPARQL JSON results, falling back to generic JSON.
        # (The fallback media type used to be misspelled "applicaton/json".)
        self.urlOpener.addheader(
            "Accept",
            "application/sparql-results+json, application/json;q=0.1")

    def uri_to_qname(self, uri):
        """Return ``uri`` abbreviated with the first matching prefix.

        The URI is returned unchanged when no configured prefix matches.
        """
        for prefix, base in self.conf.get("prefixes", {}).items():
            if uri.startswith(base):
                # Only the leading occurrence is the namespace part.
                return uri.replace(base, prefix + ":", 1)
        return uri

    def sparql_prefixes(self):
        """Return the configured prefixes as SPARQL ``PREFIX`` lines."""
        return "\n".join(
            "PREFIX %s: <%s>" % (prefix, base)
            for prefix, base in self.conf.get("prefixes", {}).items())

    @staticmethod
    def _index_settings():
        """Build the settings/mappings body used when creating an index."""
        autocomplete_fields = (
            "label", "title", "Synonym", "brand_name", "Definition")
        return {
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    "filter": {
                        "autocomplete_filter": {
                            "type": "edge_ngram",
                            "min_gram": 3,
                            "max_gram": 20
                        }
                    },
                    "analyzer": {
                        "autocomplete": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["lowercase", "autocomplete_filter"]
                        }
                    }
                }
            },
            "mappings": {
                "compound": {
                    # Every field gets its own (identical) mapping dict.
                    "properties": {
                        field: {
                            "type": "string",
                            "analyzer": "autocomplete",
                            "search_analyzer": "standard"
                        }
                        for field in autocomplete_fields
                    }
                }
            }
        }

    def run(self):
        """(Re)create each configured index and load every doc type into it.

        Unless ``self.loadOnly`` is set, each index is deleted and created
        again with the autocomplete settings before its documents are loaded.
        """
        for index in self.conf["indexes"]:
            if not self.loadOnly:
                try:
                    res = self.es.indices.delete(index=index, ignore=404)
                    print("Delete index " + index
                          + ", response : '%s'" % (res,))
                    res = self.es.indices.create(
                        index=index, body=self._index_settings())
                    print("Create index " + index
                          + ", response: '%s'" % (res,))
                except NotFoundError:
                    # Deliberate best-effort: carry on with loading.
                    pass
            for doc_type in self.conf["indexes"][index]:
                print("index " + index)
                indexer = Indexer(self, index, doc_type)
                ## TODO: Store mapping for JSON-LD
                indexer.load()

    def dryrun(self):
        """Build the SPARQL for every index/doc type without loading."""
        for index in self.conf["indexes"]:
            for doc_type in self.conf["indexes"][index]:
                print("## index/type:", index, doc_type)
                indexer = Indexer(self, index, doc_type)
                # below should print the sparql
                indexer.sparql()

    def check(self):
        """Validate the configuration; raises ``Exception`` on a problem."""
        self.check_prefixes()
        self.check_required_properties()

    def expand_qname(self, p):
        """Expand the prefixed name ``p`` (``prefix:rest``) to a full URI.

        :raises Exception: if ``p`` has no prefix or the prefix is not
            configured.
        """
        if ":" not in p:
            raise Exception("Invalid property, no prefix: " + p)
        prefix, rest = p.split(":", 1)
        if prefix not in self.conf.get("prefixes", {}):
            raise Exception("Unknown prefix: " + prefix)
        base = self.conf.get("prefixes")[prefix]
        return base + rest

    def check_property(self, p):
        """Validate one property entry from the configuration.

        A string is treated as a prefixed name and must expand to a URI;
        anything else is assumed to be a dict that needs non-empty
        ``sparql``, ``variable`` and ``jsonld`` entries.

        :raises Exception: if the entry is invalid.
        """
        if isinstance(p, str):
            urlparse(self.expand_qname(p))
            return
        ## Assume it is dict-based - check they are all non-empty
        for key in ("sparql", "variable", "jsonld"):
            if not p.get(key):
                raise Exception("'%s' missing for %s" % (key, p))

    def check_required_properties(self):
        """Ensure every index+type has at least one required triple.

        A required triple is either a ``type`` entry or a property for which
        ``is_property_required`` is true.

        :raises Exception: for the first index/doc type that has neither.
        """
        for p in self.conf.get("common_properties", []):
            if not isinstance(p, str) and is_property_required(p):
                return  # Great! required for every index

        # if not, we'll need to check each index+type
        for index, index_conf in self.conf["indexes"].items():
            for doc_type, type_conf in index_conf.items():
                if "type" in type_conf:
                    continue  # OK
                # any() instead of a bare filter(): a filter object is always
                # truthy on Python 3, which silently disabled this check.
                # Plain-string properties are skipped, as in the loop above.
                if any(not isinstance(p, str) and is_property_required(p)
                       for p in type_conf.get("properties", [])):
                    continue  # OK
                raise Exception(
                    "No type: or property with required:true for %s %s"
                    % (index, doc_type))

    def check_prefixes(self):
        """Warn about suspicious prefixes and validate all properties.

        Prefix URIs not ending in ``/`` or ``#`` only produce a warning on
        stderr; invalid types or properties raise ``Exception``.
        """
        for uri in self.conf.get("prefixes", {}).values():
            if not uri.endswith(("#", "/")):
                # This should catch prefix definitions not ending with /
                print("WARNING: Prefix doesn't end with / or #: %s" % uri,
                      file=sys.stderr)
        for p in self.conf.get("common_properties", []):
            self.check_property(p)
        for index, index_conf in self.conf["indexes"].items():
            for doc_type, type_conf in index_conf.items():
                if "type" in type_conf:
                    urlparse(self.expand_qname(type_conf["type"]))
                for p in type_conf.get("properties", []):
                    self.check_property(p)
def get_month_data(month, cookie, token):
    """Fetch the data for ``month`` from ``APIURL`` and return it as text.

    :param month: value sent as the ``date`` request parameter.
    :param cookie: value sent verbatim in the ``Cookie`` request header.
    :param token: value sent as the ``_token`` request parameter.
    :returns: the response body decoded as UTF-8.
    """
    # NOTE(review): the values are interpolated without URL-encoding, so
    # callers must pass values that are already safe for a form body.
    params = 'date={}&_token={}'.format(month, token)
    opener = FancyURLopener()
    opener.addheader('Cookie', cookie)
    stream = opener.open(APIURL, params)
    try:
        return stream.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        stream.close()