def api_analyze_catalog():
    """Analyze a catalog."""
    if 'token' not in session:
        session['token'] = str(uuid.uuid4())
    iri = request.args.get('sparql', None)
    graph = request.args.get('graph', None)
    if iri is not None and graph is not None and rfc3987.match(iri) and rfc3987.match(graph):
        current_app.logger.info(f'Analyzing endpoint {iri}, named graph {graph}')
        red = redis.Redis(connection_pool=redis_pool)

        # Throttling
        key = f'batch:{session["token"]}'
        queueLength = red.scard(key)
        while queueLength > 1000:
            current_app.logger.warning(f'Queue length: {queueLength}')
            time.sleep(60)
            queueLength = red.scard(key)

        t = inspect_graph.si(iri, graph).apply_async()
        current_app.logger.info(f'Batch id: {session["token"]}, task id: {t.id}')
        red.hset('taskBatchId', t.id, session["token"])
        return ''
    else:
        abort(400)
def get_generate_uri(source: dict, entity: str, data: dict) -> str:
    """Get the URI from a BrAPI object, or generate one."""
    pui_field = entity + 'PUI'
    data_uri = data.get(pui_field)

    if data_uri and rfc3987.match(data_uri, rule='URI'):
        # The original PUI is a valid URI
        return data_uri

    source_id = urllib.parse.quote(source['schema:identifier'])
    data_id = get_identifier(entity, data)
    if not data_uri:
        # Generate URI from source id, entity name and data id
        encoded_entity = urllib.parse.quote(entity)
        encoded_id = urllib.parse.quote(data_id)
        data_uri = f"urn:{source_id}/{encoded_entity}/{encoded_id}"
    else:
        # Generate URI by prepending the original URI with the source identifier
        encoded_uri = urllib.parse.quote(data_uri)
        data_uri = f"urn:{source_id}/{encoded_uri}"

    if not rfc3987.match(data_uri, rule='URI'):
        raise Exception(
            f'Could not get or create a correct URI for "{entity}" object id "{data_id}"'
            f' (malformed URI: "{data_uri}")')

    return data_uri
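A hypothetical call, for illustration only; the shapes of `source` and `data` and the behaviour of the project's `get_identifier` helper are assumptions, not taken from the source:

source = {'schema:identifier': 'BrAPI endpoint 1'}
data = {'germplasmDbId': '42'}   # no 'germplasmPUI' key, so a urn: URI is generated

# assuming get_identifier('germplasm', data) returns '42':
get_generate_uri(source, 'germplasm', data)
# -> 'urn:BrAPI%20endpoint%201/germplasm/42'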
def validAllowList(self, allowlist):
    if not self.is_type(allowlist):
        raise ValueError()
    for origin in allowlist:
        # An entry is acceptable if it is a recognised keyword or a valid URI;
        # reject the list as soon as an entry is neither.
        if origin not in HTMLFeaturePolicy.valid_origins and not rfc3987.match(origin, rule='URI'):
            return False
    return True
def is_valid_url(a_string):
    """Check if a string is a valid URL."""
    match_obj = rfc3987.match(a_string, 'URI')
    return match_obj is not None
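Many snippets in this collection differ only in the rule name they pass to rfc3987.match. A minimal illustration of what the common rules accept (my own example, not from any of the projects here):

import rfc3987

rfc3987.match('http://example.org/a', rule='URI')        # matches
rfc3987.match('/relative/path', rule='URI')              # None: a URI needs a scheme
rfc3987.match('/relative/path', rule='URI_reference')    # matches: relative refs allowed
rfc3987.match('http://example.org/例', rule='IRI')       # matches: non-ASCII ok in IRIs
rfc3987.match('http://example.org/例', rule='URI')       # None: URIs are ASCII-only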
def ExtractTextFeatures():
    uri2label = GetEList()
    id2text = open("../dataset/Graph_ID_Text.txt", "w", encoding="utf-8")
    NodeID = open("../dataset/Graph_Origin_Node_ID.txt", "r", encoding="utf-8")
    count = 0
    for line in NodeID.readlines():
        uri, id = line.strip().split("\t\t")
        text = ""
        if match(uri, rule='IRI_reference') is None:
            text = uri
        elif uri in uri2label.keys():
            text = uri2label[uri]
        elif len(uri.split("#")) > 1:
            # Split a CamelCase fragment into words, e.g. "hasPart" -> "has Part"
            text = re.sub(r"(\w)([A-Z])", r"\1 \2", (uri.split("#"))[-1])
        else:
            g = Graph()
            try:
                g.parse(uri)
                text = g.label(URIRef(uri))
            except Exception:
                print("Cannot parse!")
        if text == "":
            uri_s = uri.split("resource/")
            if len(uri_s) > 1:
                text = uri_s[-1]
            else:
                text = (uri.split("/"))[-1]
        id2text.write(id + "\t\t" + text + "\n")
        print(count)
        count += 1
    id2text.close()
    NodeID.close()
def ExtractTextFeatures():
    id2text = open("../dataset/FGraph_ID_Text.txt", "a", encoding="utf-8")
    NodeID = open("../dataset/FGraph_Origin_Node_ID.txt", "r", encoding="utf-8")
    count = 0
    for line in NodeID.readlines():
        # Resume a previously interrupted run: skip the first 36661 lines
        # and stop at line 81000.
        if count < 36661:
            count += 1
            continue
        if count >= 81000:
            break
        uri, id = line.strip().split("\t\t")
        text = ""
        if match(uri, rule='IRI_reference') is None:
            text = uri
        elif len(uri.split("#")) > 1:
            text = re.sub(r"(\w)([A-Z])", r"\1 \2", (uri.split("#"))[-1])
        else:
            g = Graph()
            try:
                g.parse(uri)
                text = g.label(URIRef(uri))
            except Exception:
                print("Cannot parse!")
        if text == "":
            uri_s1 = uri.split("resource/")
            if len(uri_s1) > 1:
                text = uri_s1[-1]
            else:
                uri_s2 = (uri.split("/"))[-1]
                if uri_s2 != "":
                    text = uri_s2
                else:
                    text = uri
        id2text.write(id + "\t\t" + text + "\n")
        print(count)
        count += 1
    id2text.close()
    NodeID.close()
def __init__(self, url):
    self.original_url = url
    self.type = None
    self.url = ''
    self.timestamp = ''
    self.mod = ''

    if not any(f(url) for f in [self._init_query, self._init_replay]):
        raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)

    if len(self.url) == 0:
        raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)

    # protocol agnostic url -> http://
    #if self.url.startswith('//'):
    #    self.url = self.DEFAULT_SCHEME + self.url[2:]

    # no protocol -> http://
    if '://' not in self.url:
        self.url = self.DEFAULT_SCHEME + self.url

    # BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
    # %2F is fine, but %2f -- standard supports either
    matcher = rfc3987.match(self.url.upper(), 'IRI')

    if not matcher:
        raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
def ask_source():
    # Loop until the user enters a syntactically valid IRI; the loop can
    # only exit via the return below.
    while True:
        source = input("Enter the template git repository: ")
        if match(source, rule='IRI'):
            return source
        print(f"Please try another git repository than {source}.")
def _is_valid(self, instance) -> bool:
    # Something is considered to be an instance of FullIRI iff:
    #   It isn't an RDFLib Literal -- you can never cross those beams
    #   It is already declared to be a URIRef no matter what it looks like
    #   It looks like an IRI
    return instance is not None and not isinstance(instance, (rdflib.Literal, Literal)) \
        and (isinstance(instance, URIRef) or rfc3987.match(str(instance), 'IRI'))
def __init__(self, label, src, wf_name):
    # a label to reference this input by in other parts of the wf file.
    self.label = label
    # src can either be:
    #   1) URI
    #   2) absolute path on the host
    #   3) relative path on the host, relative to the CWD of eod
    # In the FUTURE: should also allow
    #   4) a global output of another eod.yml, in which case it should be
    #      <wf_name>.<outputs>.<label> (this is not yet supported)
    self.src = src
    self.is_uri = False
    if rfc3987.match(self.src, 'URI'):
        self.is_uri = True
        self.uri = self.src
    self.set_abs_host_path(wf_name)
    if not self.abs_host_path:
        raise Error("Could not compute host path for src:{}, label:{}".format(src, label))
    self.eod_container_path = to_eod(self.abs_host_path)
    base_dir = os.path.dirname(self.eod_container_path)
    # make sure the parent directory exists
    if not os.path.exists(base_dir):
        print("Creating global input base_dir:{}".format(base_dir))
        os.makedirs(base_dir)
    # create a file with the URI
    if self.is_uri:
        with open(self.eod_container_path, 'w') as f:
            print(self.uri, file=f)
def assemble_command(arguments, statistics, nicknames):
    streamer = arguments.streamer
    if match(streamer, 'absolute_URI'):
        url = streamer
        statistics.add_usage(url)
    elif nicknames.find(streamer):
        url = nicknames.get(streamer)
    else:
        print("Nickname", streamer, "has not been defined yet", file=stderr)
        return 1
    statistics.save()

    player_command = ''
    if arguments.mode == 'video':
        quality = 'best'
        player_command = '--file-caching=10000'
    elif url.find('twitch') >= 0:
        quality = 'audio'
    else:
        quality = 'worst'
        player_command = '--novideo'

    exec_string = "streamlink " \
        + "--player \"'C:\\Program Files\\VideoLAN\\VLC\\vlc.exe' " + player_command + "\"" \
        + ' ' + url + ' ' + quality

    if arguments.dry_run:
        print("The resulting command string:")
        print("[", exec_string, "]")
        return 0
    else:
        print('The real execution starts here')
        return call(exec_string, shell=False)
def POST(self):
    if not urlForm.validates():
        return render.index(urlForm)
    form_data = web.input()
    check_spec = 'checkspec' in form_data
    if rfc3987.match(form_data.URL, rule='URI_reference'):
        parsed = urlparse.urlparse(form_data['URL'], scheme='http')
        ref_url = parsed.scheme + '://' + '/'.join([parsed.netloc, parsed.path.lstrip('/')]).lstrip('/')
    else:
        ref_url = None
    if ref_url:
        params = {'url': ref_url, 'checkspec': int(check_spec)}
        raise web.seeother('/request?' + urlencode(params), '/')
    else:
        params = {
            'msg': "Sorry, not implemented yet...",
            'back_link': '/'
        }
        raise web.seeother('/error?' + urlencode(params))
def iri(instance):
    try:
        import rfc3987
    except ImportError:
        logging.warning(
            "package rfc3987 missing - cannot validate iri - so it passes")
        return True
    return rfc3987.match(instance, rule="IRI")
def validate_iri(value):
    return value is None or rfc3987.match(value, rule='IRI') is not None
def uri_reference(instance):
    try:
        import rfc3987
    except ImportError:
        logging.warning(
            "package rfc3987 missing - cannot validate uri-reference - so it passes")
        return True
    return rfc3987.match(instance, rule="URI_reference")
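The `iri` and `uri_reference` checkers above mirror the optional format checks in the jsonschema package; a sketch of how such a checker can be wired into a validator, assuming the jsonschema and rfc3987 packages are installed:

from jsonschema import Draft7Validator, FormatChecker

checker = FormatChecker()
checker.checks('iri')(iri)                      # register the checker defined above
checker.checks('uri-reference')(uri_reference)

validator = Draft7Validator({'format': 'iri'}, format_checker=checker)
validator.validate('http://example.org/x')      # passes; a malformed IRI would raise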
def uri_validator(node, uri):
    """
    URI validator.

    rfc3987 is used to check if a URI is correct
    (https://pypi.python.org/pypi/rfc3987/).

    :param node: The schema node to which this exception relates.
    :param uri: URI to validate.
    """
    if not rfc3987.match(uri, rule='URI'):
        raise colander.Invalid(node, '{0} is not a valid URI.'.format(uri))
def setupsubject(self, sub):
    logger.info('INFO add_triple.py - setup subject %s', sub)
    try:
        if rfc3987.match(sub, rule='URI') is not None:
            self.sub = sub
        else:
            raise Exception("This is not a valid URI")
    except Exception as e:
        logger.error("ERROR! add_triple.py - " + e.message)
def is_uri(uri):
    '''
    Check if a string is a valid URI according to rfc3987

    :param string uri:
    :rtype: boolean
    '''
    if uri is None:
        return False
    return rfc3987.match(uri, rule='URI') is not None
def adduri(self, p, o):
    try:
        s = self.conn.createURI(self.sub)
        if rfc3987.match(p, rule='URI') is not None:
            p = self.conn.createURI(p)
        else:
            raise Exception("This is not a valid URI")
        if rfc3987.match(o, rule='URI') is not None:
            o = self.conn.createURI(o)
        else:
            raise Exception("This is not a valid URI")
        logger.info("INFO add_triple.py - add uri <%s> %s %s", self.sub, p, o)
        if len(self.contexts) > 0:
            self.conn.add(s, p, o, contexts=self.contexts)
        else:
            self.conn.add(s, p, o)
    except Exception as e:
        logger.error("ERROR! add_triple.py - " + e.message)
def parse_streamer_url(url, nicknames):
    if match(url, 'absolute_URI'):
        rv1 = [x for x in parse(url)['path'].split('/') if x][-1]
        rv2 = url
        return rv1, rv2
    elif nicknames.find(url):
        return url, nicknames.get(url)
    else:
        print("Nickname \"{0}\" has not been defined yet".format(url))
        return None, None
def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False):
    # checks that a required or optional field, if specified, looks like a URL
    if not required and (field_name not in obj or obj[field_name] is None):
        return True  # not required, so OK

    if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs):
        return False  # just checking data type

    if allow_redacted and is_redacted(obj[field_name]):
        return True

    if not rfc3987_url.match(obj[field_name]):
        add_error(errs, 5, "Invalid Required Field Value",
                  "The '%s' field has an invalid rfc3987 URL: \"%s\"." % (field_name, obj[field_name]),
                  dataset_name)
        return False

    return True
def link_title_parse_hook(bot, channel, sender, message):
    for word in message.split(" "):
        try:
            if rfc3987.match(word, rule='URI'):
                r = requests.get(word)
                soup = BeautifulSoup(r.text, 'html.parser')
                title = soup.head.title.text.strip()
                title = title.replace("\n", "")
                bot.message(channel, " :: {}".format(title))
        except requests.exceptions.InvalidSchema:
            pass
def get_uri(source, entity_name, object):
    """Get URI from BrAPI object or generate one"""
    pui_field = entity_name + 'PUI'
    object_uri = object.get(pui_field)

    if object_uri and rfc3987.match(object_uri, rule='URI'):
        # The original URI is valid
        return object_uri

    source_id = source['schema:identifier']
    object_id = get_identifier(entity_name, object)
    if not object_uri:
        object_uri = get_uri_by_id(source, entity_name, object_id)
    else:
        # Generate URI by prepending the original URI with the source identifier
        object_uri = 'urn:{}/{}'.format(source_id, urllib.parse.quote(object_uri, safe=''))

    if not rfc3987.match(object_uri, rule='URI'):
        raise Exception('Could not get or create a correct URI for "{}" object id "{}" (malformed URI: "{}")'
                        .format(entity_name, object_id, object_uri))
    return object_uri
def _validate_stringformat(self, stringtype, field, value):
    """ {'allowed': ['url', 'json']} """
    if stringtype == 'json':
        try:
            json.loads(value)
            return True
        except ValueError:
            self._error(field, STRINGFORMAT_JSON)
    elif stringtype == 'url':
        if match(value, rule='URI'):
            return True
        else:
            self._error(field, STRINGFORMAT_URL)
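A sketch of driving the custom Cerberus rule above from a schema; `UrlValidator` is a hypothetical stand-in for whatever Validator subclass actually defines the method:

schema = {'homepage': {'type': 'string', 'stringformat': 'url'}}
v = UrlValidator(schema)   # hypothetical subclass carrying _validate_stringformat

v.validate({'homepage': 'https://example.org/'})   # True
v.validate({'homepage': 'not a uri'})              # False; error recorded for 'homepage'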
def startAuthenticationProcess(self):
    if (self.settingsDialog.group_account.input_url.text
            and self.settingsDialog.group_account.input_username.text
            and self.settingsDialog.group_account.input_password.text):
        self.saveSettings()
        self.loadSettings()
        if match(self.url, "URI"):
            try:
                request = requests.get(self.url, timeout=3)
                if request.status_code == 200:
                    oc = owncloud.Client(self.url)
                    oc.login(self.username, self.password)
                    self.connectStatus = "true"
                    self.saveSettings()
                    self.updateUi()
            except requests.exceptions.RequestException as e:
                QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error",
                                     "The specified Server URL is invalid!")
                settings = QSettings()
                settings.remove("connect-status")
                self.saveSettings()
                self.updateUi()
            except Exception as e:
                errorMessage = self.formatConnectionError(e.message)
                if errorMessage == "401":
                    self.settingsDialog.group_connection.widget_status.label_status.setText("Invalid")
                else:
                    QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error", errorMessage)
        else:
            QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error",
                                 "The specified Server URL is invalid!")
    else:
        missingFields = ""
        fieldText = "field"
        if not self.settingsDialog.group_account.input_url.text:
            missingFields = "\"Server URL\""
        if not self.settingsDialog.group_account.input_username.text:
            if missingFields == "":
                missingFields = "\"Username\""
            else:
                missingFields = missingFields + " and \"Username\""
                fieldText = "fields"
        if not self.settingsDialog.group_account.input_password.text:
            if missingFields == "":
                missingFields = "\"Password\""
            else:
                missingFields = missingFields.replace(" and", ",") + " and \"Password\""
                fieldText = "fields"
        QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error",
                             "The " + missingFields + " " + fieldText + " must be filled in!")
def __init__(self, url):
    self.original_url = url
    self.type = None
    self.url = None
    self.timestamp = None
    self.mod = None

    if not any(f(self, url) for f in [aurl._init_query, aurl._init_replay]):
        raise RequestParseException("Invalid WB Request Url: " + url)

    matcher = rfc3987.match(self.url, "IRI_reference")
    if not matcher:
        raise RequestParseException("Bad Request Url: " + self.url)
def create_reference(md5, pos):
    """ Adds a reference notation to an upload """
    url = request.args.get('url', '')
    if not rfc3987.match(url, rule='URI'):
        # The original string lacked the %s placeholder, which raises a
        # TypeError at runtime.
        return "Sorry, that's not a valid URL: %s\n" % url
    u = Upload.objects.filter(md5=md5).first()
    if not u:
        abort(404)
    # Create the reference
    r = Reference(upload=u, ref_url=url, pos=pos)
    r.save()
    return url
def addliteral(self, p, o):
    try:
        s = self.conn.createURI(self.sub)
        if rfc3987.match(p, rule='URI') is not None:
            p = self.conn.createURI(p)
        else:
            raise Exception("This is not a valid URI")
        o = self.conn.createLiteral(o)
        logger.debug("DEBUG add_triple.py - add literal <%s> %s '%s'", self.sub, p, o)
        if len(self.contexts) > 0:
            self.conn.add(s, p, o, contexts=self.contexts)
        else:
            self.conn.add(s, p, o)
    except Exception as e:
        logger.error("ERROR! add_triple.py - " + e.message)
def download_image(url):
    # Skip empty strings and strings that are not absolute urls.
    if not url or not rfc3987.match(url, 'absolute_URI'):
        return None, None
    # Get url.
    try:
        response = requests.get(url)
    except requests.exceptions.ConnectionError:
        return None, None
    if response.status_code == 200:
        image = BytesIO(response.content)
        # Check that the file is really an image.
        image_type = imghdr.what(image)
        if image_type:
            return ImageFile(image), image_type
    return None, None
def link_title_parse_hook(bot, channel, sender, message):
    if not allowed_to_process(bot, channel):
        return
    for word in message.split(" "):
        try:
            if rfc3987.match(word, rule='URI'):
                r = requests.get(word)
                if r.status_code != 200:
                    return
                soup = BeautifulSoup(r.text, 'html.parser')
                title = soup.head.title.text.strip()
                title = title.replace("\n", "")
                bot.message(channel, " :: {}".format(title))
        except requests.exceptions.InvalidSchema:
            pass
def startAuthenticationProcess(self):
    if (self.settingsDialog.group_account.input_url.text
            and self.settingsDialog.group_account.input_username.text
            and self.settingsDialog.group_account.input_password.text):
        self.saveSettings()
        self.loadSettings()
        if match(self.url, "URI"):
            try:
                request = requests.get(self.url, timeout=3)
                if request.status_code == 200:
                    oc = nextcloud.Client(self.url)
                    oc.login(self.username, self.password)
                    self.connectStatus = "true"
                    self.saveSettings()
                    self.updateUi()
            except requests.exceptions.RequestException as e:
                QMessageBox.critical(self.settingsDialog, "NextCloud Connection Error",
                                     "The specified Server URL is invalid!")
                settings = QSettings()
                settings.remove("connect-status")
                self.saveSettings()
                self.updateUi()
            except Exception as e:
                errorMessage = self.formatConnectionError(e.message)
                if errorMessage == "401":
                    self.settingsDialog.group_connection.widget_status.label_status.setText("Invalid")
                else:
                    QMessageBox.critical(self.settingsDialog, "NextCloud Connection Error", errorMessage)
        else:
            QMessageBox.critical(self.settingsDialog, "NextCloud Connection Error",
                                 "The specified Server URL is invalid!")
    else:
        missingFields = ""
        fieldText = "field"
        if not self.settingsDialog.group_account.input_url.text:
            missingFields = "\"Server URL\""
        if not self.settingsDialog.group_account.input_username.text:
            if missingFields == "":
                missingFields = "\"Username\""
            else:
                missingFields = missingFields + " and \"Username\""
                fieldText = "fields"
        if not self.settingsDialog.group_account.input_password.text:
            if missingFields == "":
                missingFields = "\"Password\""
            else:
                missingFields = missingFields.replace(" and", ",") + " and \"Password\""
                fieldText = "fields"
        QMessageBox.critical(self.settingsDialog, "NextCloud Connection Error",
                             "The " + missingFields + " " + fieldText + " must be filled in!")
def triple_value(value):
    """
    This function takes as input the predicate's value and returns it in the
    right format, whether it is an integer, decimal, double, boolean, date,
    time or dateTime literal, or a URI.
    """
    # Check whether the value is null or empty
    if value is None:
        return ""
    else:
        value = value.strip()
        # Return an empty string if the value is an empty string
        if value == "":
            return ""
        if value == "\\":
            value += "\\"
        # Replace double quote with a single quote
        value = to_unicode(value)
        value = value.replace('"', "'")

        # URI values
        if ("http://" in value or "https://" in value) and " " not in value:
            if match(value) is not None:
                return to_unicode(u"<{0}>".format(value))
            elif re.search("[“”’`\r\n'\"]+", value, re.IGNORECASE):
                return to_unicode(u"\"\"\"{0}\"\"\"".format(value))
            else:
                # ^^xsd:string
                return to_unicode(u"\"{0}\"".format(value))

        # NUMBERS: can be written like other literals with lexical form and datatype
        elif RDF.rgxInteger.match(value):
            return u"\"{0}\"^^xsd:integer".format(value)
        elif RDF.rgxDecimal.match(value):
            return u"\"{0}\"^^xsd:decimal".format(value)
        elif RDF.rgxDouble.match(value):
            return u"\"{0}\"^^xsd:double".format(value)

        # BOOLEAN: values may be written as either 'true' or 'false' (case-sensitive)
        # and represent RDF literals with the datatype xsd:boolean.
        elif value == "true" or value == "false":
            return u"\"{0}\"^^xsd:boolean".format(value)

        # DATE: specified in the following form "YYYY-MM-DD"
        # Note: All components are required!
        elif RDF.rgxDate.match(value):
            return u"\"{0}\"^^xsd:date".format(value)

        # TIME:
        elif RDF.rgxTime.match(value):
            return u"\"{0}\"^^xsd:time".format(value)

        # DATE - TIME:
        elif RDF.rgxDateTime.match(value):
            return u"\"{0}\"^^xsd:dateTime".format(value)

        # TEXT containing quotes or line breaks -> triple-quoted ^^xsd:string
        elif re.search("[“”’`\r\n'\"]+", value, re.IGNORECASE):
            return to_unicode(u"\"\"\"{0}\"\"\"".format(value).replace(u"\\", u"\\\\"))
        else:
            # ^^xsd:string
            return to_unicode(u"\"{0}\"".format(value)).replace(u"\\", u"\\\\")
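Illustrative inputs and expected outputs; the RDF.rgx* regexes are project-internal, so these are assumptions about the obvious lexical forms, not captured output:

triple_value(u"http://example.org/x")   # -> u"<http://example.org/x>"
triple_value(u"42")                     # -> u'"42"^^xsd:integer'
triple_value(u"true")                   # -> u'"true"^^xsd:boolean'
triple_value(u"it's plain text")        # -> u'"""it\'s plain text"""' (triple-quoted)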
def do_validation(doc, errors_array, seen_identifiers):
    errs = {}

    if type(doc) != list:
        add_error(errs, 0, "Bad JSON Structure",
                  "The file must be an array at its top level. "
                  "That means the file starts with an open bracket [ and ends with a close bracket ].")
    elif len(doc) == 0:
        add_error(errs, 0, "Catalog Is Empty", "There are no entries in your file.")
    else:
        for i, item in enumerate(doc):
            # Required

            dataset_name = "dataset %d" % (i + 1)

            # title
            if check_required_string_field(item, "title", 1, dataset_name, errs):
                dataset_name = '"%s"' % item.get("title", "").strip()

            # accessLevel - required
            if check_required_string_field(item, "accessLevel", 3, dataset_name, errs):
                if item["accessLevel"] not in ("public", "restricted public", "non-public"):
                    add_error(errs, 5, "Invalid Required Field Value",
                              "The field 'accessLevel' had an invalid value: \"%s\"" % item["accessLevel"],
                              dataset_name)

            # bureauCode - required
            if not is_redacted(item.get('bureauCode')):
                if check_required_field(item, "bureauCode", list, dataset_name, errs):
                    for bc in item["bureauCode"]:
                        if not isinstance(bc, (str, unicode)):
                            add_error(errs, 5, "Invalid Required Field Value",
                                      "Each bureauCode must be a string", dataset_name)
                        elif ":" not in bc:
                            add_error(errs, 5, "Invalid Required Field Value",
                                      "The bureau code \"%s\" is invalid. "
                                      "Start with the agency code, then a colon, then the bureau code." % bc,
                                      dataset_name)
                        elif bc not in omb_burueau_codes:
                            add_error(errs, 5, "Invalid Required Field Value",
                                      "The bureau code \"%s\" was not found in our list "
                                      "(https://project-open-data.cio.gov/data/omb_bureau_codes.csv)." % bc,
                                      dataset_name)

            # contactPoint - required
            if check_required_field(item, "contactPoint", dict, dataset_name, errs):
                cp = item["contactPoint"]

                # contactPoint - fn - required
                check_required_string_field(cp, "fn", 1, dataset_name, errs)

                # contactPoint - hasEmail - required
                if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
                    if not is_redacted(cp.get('hasEmail')):
                        email = cp["hasEmail"].replace('mailto:', '')
                        if not email_validator(email):
                            add_error(errs, 5, "Invalid Required Field Value",
                                      "The email address \"%s\" is not a valid email address." % email,
                                      dataset_name)

            # description - required
            check_required_string_field(item, "description", 1, dataset_name, errs)

            # identifier - required
            if check_required_string_field(item, "identifier", 1, dataset_name, errs):
                if item["identifier"] in seen_identifiers:
                    add_error(errs, 5, "Invalid Required Field Value",
                              "The dataset identifier \"%s\" is used more than once." % item["identifier"],
                              dataset_name)
                seen_identifiers.add(item["identifier"])

            # keyword - required
            if isinstance(item.get("keyword"), (str, unicode)):
                if not is_redacted(item.get("keyword")):
                    add_error(errs, 5, "Update Your File!",
                              "The keyword field used to be a string but now it must be an array.",
                              dataset_name)
            elif check_required_field(item, "keyword", list, dataset_name, errs):
                for kw in item["keyword"]:
                    if not isinstance(kw, (str, unicode)):
                        add_error(errs, 5, "Invalid Required Field Value",
                                  "Each keyword in the keyword array must be a string", dataset_name)
                    elif len(kw.strip()) == 0:
                        add_error(errs, 5, "Invalid Required Field Value",
                                  "A keyword in the keyword array was an empty string.", dataset_name)

            # modified - required
            if check_required_string_field(item, "modified", 1, dataset_name, errs):
                if not is_redacted(item['modified']) \
                        and not MODIFIED_REGEX_1.match(item['modified']) \
                        and not MODIFIED_REGEX_2.match(item['modified']) \
                        and not MODIFIED_REGEX_3.match(item['modified']):
                    add_error(errs, 5, "Invalid Required Field Value",
                              "The field \"modified\" is not in valid format: \"%s\"" % item['modified'],
                              dataset_name)

            # programCode - required
            if not is_redacted(item.get('programCode')):
                if check_required_field(item, "programCode", list, dataset_name, errs):
                    for pc in item["programCode"]:
                        if not isinstance(pc, (str, unicode)):
                            add_error(errs, 5, "Invalid Required Field Value",
                                      "Each programCode in the programCode array must be a string",
                                      dataset_name)
                        elif not PROGRAM_CODE_REGEX.match(pc):
                            add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                                      "One of programCodes is not in valid format (ex. 018:001): \"%s\"" % pc,
                                      dataset_name)

            # publisher - required
            if check_required_field(item, "publisher", dict, dataset_name, errs):
                # publisher - name - required
                check_required_string_field(item["publisher"], "name", 1, dataset_name, errs)

            # Required-If-Applicable

            # dataQuality - Required-If-Applicable
            if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")):
                pass  # not required or REDACTED
            elif not isinstance(item["dataQuality"], bool):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'dataQuality' must be true or false, "
                          "as a JSON boolean literal (not the string \"true\" or \"false\").",
                          dataset_name)

            # distribution - Required-If-Applicable
            if item.get("distribution") is None:
                pass  # not required
            elif not isinstance(item["distribution"], list):
                if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")):
                    pass
                else:
                    add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                              "The field 'distribution' must be an array, if present.",
                              dataset_name)
            else:
                for j, dt in enumerate(item["distribution"]):
                    if isinstance(dt, (str, unicode)):
                        if is_redacted(dt):
                            continue
                    distribution_name = dataset_name + (" distribution %d" % (j + 1))

                    # distribution - downloadURL - Required-If-Applicable
                    check_url_field(False, dt, "downloadURL", distribution_name, errs, allow_redacted=True)

                    # distribution - mediaType - Required-If-Applicable
                    if 'downloadURL' in dt:
                        if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
                            if not IANA_MIME_REGEX.match(dt["mediaType"]) \
                                    and not is_redacted(dt["mediaType"]):
                                add_error(errs, 5, "Invalid Field Value",
                                          "The distribution mediaType \"%s\" is invalid. "
                                          "It must be in IANA MIME format." % dt["mediaType"],
                                          distribution_name)

                    # distribution - accessURL - optional
                    check_url_field(False, dt, "accessURL", distribution_name, errs, allow_redacted=True)

                    # distribution - conformsTo - optional
                    check_url_field(False, dt, "conformsTo", distribution_name, errs, allow_redacted=True)

                    # distribution - describedBy - optional
                    check_url_field(False, dt, "describedBy", distribution_name, errs, allow_redacted=True)

                    # distribution - describedByType - optional
                    if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")):
                        pass  # not required or REDACTED
                    elif not IANA_MIME_REGEX.match(dt["describedByType"]):
                        add_error(errs, 5, "Invalid Field Value",
                                  "The describedByType \"%s\" is invalid. "
                                  "It must be in IANA MIME format." % dt["describedByType"],
                                  distribution_name)

                    # distribution - description - optional
                    if dt.get("description") is not None:
                        check_required_string_field(dt, "description", 1, distribution_name, errs)

                    # distribution - format - optional
                    if dt.get("format") is not None:
                        check_required_string_field(dt, "format", 1, distribution_name, errs)

                    # distribution - title - optional
                    if dt.get("title") is not None:
                        check_required_string_field(dt, "title", 1, distribution_name, errs)

            # license - Required-If-Applicable
            check_url_field(False, item, "license", dataset_name, errs, allow_redacted=True)

            # rights - Required-If-Applicable
            # TODO move to warnings
            # if item.get("accessLevel") != "public":
            #     check_string_field(item, "rights", 1, dataset_name, errs)

            # spatial - Required-If-Applicable
            # TODO: There are more requirements than it be a string.
            if item.get("spatial") is not None and not isinstance(item.get("spatial"), (str, unicode)):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'spatial' must be a string value if specified.",
                          dataset_name)

            # temporal - Required-If-Applicable
            if item.get("temporal") is None or is_redacted(item.get("temporal")):
                pass  # not required or REDACTED
            elif not isinstance(item["temporal"], (str, unicode)):
                add_error(errs, 10, "Invalid Field Value (Optional Fields)",
                          "The field 'temporal' must be a string value if specified.",
                          dataset_name)
            elif "/" not in item["temporal"]:
                add_error(errs, 10, "Invalid Field Value (Optional Fields)",
                          "The field 'temporal' must be two dates separated by a forward slash.",
                          dataset_name)
            elif not TEMPORAL_REGEX_1.match(item['temporal']) \
                    and not TEMPORAL_REGEX_2.match(item['temporal']) \
                    and not TEMPORAL_REGEX_3.match(item['temporal']):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'temporal' has an invalid start or end date.",
                          dataset_name)

            # Expanded Fields

            # accrualPeriodicity - optional
            if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES \
                    and not is_redacted(item.get("accrualPeriodicity")):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'accrualPeriodicity' had an invalid value.",
                          dataset_name)

            # conformsTo - optional
            check_url_field(False, item, "conformsTo", dataset_name, errs, allow_redacted=True)

            # describedBy - optional
            check_url_field(False, item, "describedBy", dataset_name, errs, allow_redacted=True)

            # describedByType - optional
            if item.get("describedByType") is None or is_redacted(item.get("describedByType")):
                pass  # not required or REDACTED
            elif not IANA_MIME_REGEX.match(item["describedByType"]):
                add_error(errs, 5, "Invalid Field Value",
                          "The describedByType \"%s\" is invalid. "
                          "It must be in IANA MIME format." % item["describedByType"],
                          dataset_name)

            # isPartOf - optional
            if item.get("isPartOf"):
                check_required_string_field(item, "isPartOf", 1, dataset_name, errs)

            # issued - optional
            if item.get("issued") is not None and not is_redacted(item.get("issued")):
                if not ISSUED_REGEX.match(item['issued']):
                    add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                              "The field 'issued' is not in a valid format.",
                              dataset_name)

            # landingPage - optional
            check_url_field(False, item, "landingPage", dataset_name, errs, allow_redacted=True)

            # language - optional
            if item.get("language") is None or is_redacted(item.get("language")):
                pass  # not required or REDACTED
            elif not isinstance(item["language"], list):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'language' must be an array, if present.",
                          dataset_name)
            else:
                for s in item["language"]:
                    if not LANGUAGE_REGEX.match(s) and not is_redacted(s):
                        add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                                  "The field 'language' had an invalid language: \"%s\"" % s,
                                  dataset_name)

            # PrimaryITInvestmentUII - optional
            if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")):
                pass  # not required or REDACTED
            elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'PrimaryITInvestmentUII' must be a string "
                          "in 023-000000001 format, if present.",
                          dataset_name)

            # references - optional
            if item.get("references") is None:
                pass  # not required or REDACTED
            elif not isinstance(item["references"], list):
                if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")):
                    pass
                else:
                    add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                              "The field 'references' must be an array, if present.",
                              dataset_name)
            else:
                for s in item["references"]:
                    if not rfc3987_url.match(s) and not is_redacted(s):
                        add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                                  "The field 'references' had an invalid rfc3987 URL: \"%s\"" % s,
                                  dataset_name)

            # systemOfRecords - optional
            check_url_field(False, item, "systemOfRecords", dataset_name, errs, allow_redacted=True)

            # theme - optional
            if item.get("theme") is None or is_redacted(item.get("theme")):
                pass  # not required or REDACTED
            elif not isinstance(item["theme"], list):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'theme' must be an array.", dataset_name)
            else:
                for s in item["theme"]:
                    if not isinstance(s, (str, unicode)):
                        add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                                  "Each value in the theme array must be a string",
                                  dataset_name)
                    elif len(s.strip()) == 0:
                        add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                                  "A value in the theme array was an empty string.",
                                  dataset_name)

    # Form the output data.
    for err_type in sorted(errs):
        errors_array.append((
            err_type[1],  # heading
            [err_item + (" (%d locations)" % len(errs[err_type][err_item])
                         if len(errs[err_type][err_item]) else "")
             for err_item in sorted(errs[err_type],
                                    key=lambda x: (-len(errs[err_type][x]), x))]))
def _is_valid(self, instance) -> bool:
    return instance is not None and rfc3987.match(str(instance), 'IRI')
def valid_url(url):
    iri_match = iri.match(url, rule='absolute_IRI')
    return iri_match is not None
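Unlike 'IRI', the 'absolute_IRI' rule also rejects fragments, so this validator is stricter than most in this collection (illustrative values):

valid_url('http://example.org/page')           # True
valid_url('http://example.org/page#section')   # False: absolute_IRI forbids fragments
valid_url('example.org')                       # False: no scheme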
def __init__(self, is_optional=False, *args, **kwargs):
    Property.__init__(self, 'url', "", is_optional, *args, **kwargs)
    self._set_validator(
        lambda val: (isinstance(val, str) or isinstance(val, unicode))
        and (len(val) > 0)
        and rfc3987.match(val, 'URI') is not None)
def url_valid(url):
    return rfc3987.match(url, rule='URI') is not None
def do_validation(doc, errors_array, seen_identifiers):
    errs = {}

    if type(doc) != list:
        add_error(
            errs, 0, "Bad JSON Structure",
            "The file must be an array at its top level. "
            "That means the file starts with an open bracket [ and ends with a close bracket ].",
        )
    elif len(doc) == 0:
        add_error(errs, 0, "Catalog Is Empty", "There are no entries in your file.")
    else:
        for i, item in enumerate(doc):
            # Required

            dataset_name = "dataset %d" % (i + 1)

            # title
            if check_required_string_field(item, "title", 1, dataset_name, errs):
                dataset_name = '"%s"' % item.get("title", "").strip()

            # accessLevel - required
            if check_required_string_field(item, "accessLevel", 3, dataset_name, errs):
                if item["accessLevel"] not in ("public", "restricted public", "non-public"):
                    add_error(
                        errs, 5, "Invalid Required Field Value",
                        "The field 'accessLevel' had an invalid value: \"%s\"" % item["accessLevel"],
                        dataset_name,
                    )

            # bureauCode - required
            if not is_redacted(item.get("bureauCode")):
                if check_required_field(item, "bureauCode", list, dataset_name, errs):
                    for bc in item["bureauCode"]:
                        if not isinstance(bc, (str, unicode)):
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                "Each bureauCode must be a string",
                                dataset_name,
                            )
                        elif ":" not in bc:
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                'The bureau code "%s" is invalid. '
                                "Start with the agency code, then a colon, then the bureau code." % bc,
                                dataset_name,
                            )
                        elif bc not in omb_burueau_codes:
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                'The bureau code "%s" was not found in our list '
                                "(https://project-open-data.cio.gov/data/omb_bureau_codes.csv)." % bc,
                                dataset_name,
                            )

            # contactPoint - required
            if check_required_field(item, "contactPoint", dict, dataset_name, errs):
                cp = item["contactPoint"]

                # contactPoint - fn - required
                check_required_string_field(cp, "fn", 1, dataset_name, errs)

                # contactPoint - hasEmail - required
                if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
                    if not is_redacted(cp.get("hasEmail")):
                        email = cp["hasEmail"].replace("mailto:", "")
                        if not email_validator(email):
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                'The email address "%s" is not a valid email address.' % email,
                                dataset_name,
                            )

            # description - required
            check_required_string_field(item, "description", 1, dataset_name, errs)

            # identifier - required
            if check_required_string_field(item, "identifier", 1, dataset_name, errs):
                if item["identifier"] in seen_identifiers:
                    add_error(
                        errs, 5, "Invalid Required Field Value",
                        'The dataset identifier "%s" is used more than once.' % item["identifier"],
                        dataset_name,
                    )
                seen_identifiers.add(item["identifier"])

            # keyword - required
            if isinstance(item.get("keyword"), (str, unicode)):
                if not is_redacted(item.get("keyword")):
                    add_error(
                        errs, 5, "Update Your File!",
                        "The keyword field used to be a string but now it must be an array.",
                        dataset_name,
                    )
            elif check_required_field(item, "keyword", list, dataset_name, errs):
                for kw in item["keyword"]:
                    if not isinstance(kw, (str, unicode)):
                        add_error(
                            errs, 5, "Invalid Required Field Value",
                            "Each keyword in the keyword array must be a string",
                            dataset_name,
                        )
                    elif len(kw.strip()) == 0:
                        add_error(
                            errs, 5, "Invalid Required Field Value",
                            "A keyword in the keyword array was an empty string.",
                            dataset_name,
                        )

            # modified - required
            if check_required_string_field(item, "modified", 1, dataset_name, errs):
                if (
                    not is_redacted(item["modified"])
                    and not MODIFIED_REGEX_1.match(item["modified"])
                    and not MODIFIED_REGEX_2.match(item["modified"])
                    and not MODIFIED_REGEX_3.match(item["modified"])
                ):
                    add_error(
                        errs, 5, "Invalid Required Field Value",
                        'The field "modified" is not in valid format: "%s"' % item["modified"],
                        dataset_name,
                    )

            # programCode - required
            if not is_redacted(item.get("programCode")):
                if check_required_field(item, "programCode", list, dataset_name, errs):
                    for pc in item["programCode"]:
                        if not isinstance(pc, (str, unicode)):
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                "Each programCode in the programCode array must be a string",
                                dataset_name,
                            )
                        elif not PROGRAM_CODE_REGEX.match(pc):
                            add_error(
                                errs, 50, "Invalid Field Value (Optional Fields)",
                                'One of programCodes is not in valid format (ex. 018:001): "%s"' % pc,
                                dataset_name,
                            )

            # publisher - required
            if check_required_field(item, "publisher", dict, dataset_name, errs):
                # publisher - name - required
                check_required_string_field(item["publisher"], "name", 1, dataset_name, errs)

            # Required-If-Applicable

            # dataQuality - Required-If-Applicable
            if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")):
                pass  # not required or REDACTED
            elif not isinstance(item["dataQuality"], bool):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'dataQuality' must be true or false, "
                    'as a JSON boolean literal (not the string "true" or "false").',
                    dataset_name,
                )

            # distribution - Required-If-Applicable
            if item.get("distribution") is None:
                pass  # not required
            elif not isinstance(item["distribution"], list):
                if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")):
                    pass
                else:
                    add_error(
                        errs, 50, "Invalid Field Value (Optional Fields)",
                        "The field 'distribution' must be an array, if present.",
                        dataset_name,
                    )
            else:
                for j, dt in enumerate(item["distribution"]):
                    if isinstance(dt, (str, unicode)):
                        if is_redacted(dt):
                            continue
                    distribution_name = dataset_name + (" distribution %d" % (j + 1))

                    # distribution - downloadURL - Required-If-Applicable
                    check_url_field(False, dt, "downloadURL", distribution_name, errs, allow_redacted=True)

                    # distribution - mediaType - Required-If-Applicable
                    if "downloadURL" in dt:
                        if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
                            if not IANA_MIME_REGEX.match(dt["mediaType"]) and not is_redacted(dt["mediaType"]):
                                add_error(
                                    errs, 5, "Invalid Field Value",
                                    'The distribution mediaType "%s" is invalid. '
                                    "It must be in IANA MIME format." % dt["mediaType"],
                                    distribution_name,
                                )

                    # distribution - accessURL - optional
                    check_url_field(False, dt, "accessURL", distribution_name, errs, allow_redacted=True)

                    # distribution - conformsTo - optional
                    check_url_field(False, dt, "conformsTo", distribution_name, errs, allow_redacted=True)

                    # distribution - describedBy - optional
                    check_url_field(False, dt, "describedBy", distribution_name, errs, allow_redacted=True)

                    # distribution - describedByType - optional
                    if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")):
                        pass  # not required or REDACTED
                    elif not IANA_MIME_REGEX.match(dt["describedByType"]):
                        add_error(
                            errs, 5, "Invalid Field Value",
                            'The describedByType "%s" is invalid. '
                            "It must be in IANA MIME format." % dt["describedByType"],
                            distribution_name,
                        )

                    # distribution - description - optional
                    if dt.get("description") is not None:
                        check_required_string_field(dt, "description", 1, distribution_name, errs)

                    # distribution - format - optional
                    if dt.get("format") is not None:
                        check_required_string_field(dt, "format", 1, distribution_name, errs)

                    # distribution - title - optional
                    if dt.get("title") is not None:
                        check_required_string_field(dt, "title", 1, distribution_name, errs)

            # license - Required-If-Applicable
            check_url_field(False, item, "license", dataset_name, errs, allow_redacted=True)

            # rights - Required-If-Applicable
            # TODO move to warnings
            # if item.get("accessLevel") != "public":
            #     check_string_field(item, "rights", 1, dataset_name, errs)

            # spatial - Required-If-Applicable
            # TODO: There are more requirements than it be a string.
            if item.get("spatial") is not None and not isinstance(item.get("spatial"), (str, unicode)):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'spatial' must be a string value if specified.",
                    dataset_name,
                )

            # temporal - Required-If-Applicable
            if item.get("temporal") is None or is_redacted(item.get("temporal")):
                pass  # not required or REDACTED
            elif not isinstance(item["temporal"], (str, unicode)):
                add_error(
                    errs, 10, "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' must be a string value if specified.",
                    dataset_name,
                )
            elif "/" not in item["temporal"]:
                add_error(
                    errs, 10, "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' must be two dates separated by a forward slash.",
                    dataset_name,
                )
            elif (
                not TEMPORAL_REGEX_1.match(item["temporal"])
                and not TEMPORAL_REGEX_2.match(item["temporal"])
                and not TEMPORAL_REGEX_3.match(item["temporal"])
            ):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' has an invalid start or end date.",
                    dataset_name,
                )

            # Expanded Fields

            # accrualPeriodicity - optional
            if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES and not is_redacted(
                item.get("accrualPeriodicity")
            ):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'accrualPeriodicity' had an invalid value.",
                    dataset_name,
                )

            # conformsTo - optional
            check_url_field(False, item, "conformsTo", dataset_name, errs, allow_redacted=True)

            # describedBy - optional
            check_url_field(False, item, "describedBy", dataset_name, errs, allow_redacted=True)

            # describedByType - optional
            if item.get("describedByType") is None or is_redacted(item.get("describedByType")):
                pass  # not required or REDACTED
            elif not IANA_MIME_REGEX.match(item["describedByType"]):
                add_error(
                    errs, 5, "Invalid Field Value",
                    'The describedByType "%s" is invalid. '
                    "It must be in IANA MIME format." % item["describedByType"],
                    dataset_name,
                )

            # isPartOf - optional
            if item.get("isPartOf"):
                check_required_string_field(item, "isPartOf", 1, dataset_name, errs)

            # issued - optional
            if item.get("issued") is not None and not is_redacted(item.get("issued")):
                if not ISSUED_REGEX.match(item["issued"]):
                    add_error(
                        errs, 50, "Invalid Field Value (Optional Fields)",
                        "The field 'issued' is not in a valid format.",
                        dataset_name,
                    )

            # landingPage - optional
            check_url_field(False, item, "landingPage", dataset_name, errs, allow_redacted=True)

            # language - optional
            if item.get("language") is None or is_redacted(item.get("language")):
                pass  # not required or REDACTED
            elif not isinstance(item["language"], list):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'language' must be an array, if present.",
                    dataset_name,
                )
            else:
                for s in item["language"]:
                    if not LANGUAGE_REGEX.match(s) and not is_redacted(s):
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "The field 'language' had an invalid language: \"%s\"" % s,
                            dataset_name,
                        )

            # PrimaryITInvestmentUII - optional
            if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")):
                pass  # not required or REDACTED
            elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'PrimaryITInvestmentUII' must be a string "
                    "in 023-000000001 format, if present.",
                    dataset_name,
                )

            # references - optional
            if item.get("references") is None:
                pass  # not required or REDACTED
            elif not isinstance(item["references"], list):
                if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")):
                    pass
                else:
                    add_error(
                        errs, 50, "Invalid Field Value (Optional Fields)",
                        "The field 'references' must be an array, if present.",
                        dataset_name,
                    )
            else:
                for s in item["references"]:
                    if not rfc3987_url.match(s) and not is_redacted(s):
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "The field 'references' had an invalid rfc3987 URL: \"%s\"" % s,
                            dataset_name,
                        )
                if len(item["references"]) != len(set(item["references"])):
                    add_error(
                        errs, 50, "Invalid Field Value (Optional Fields)",
                        "The field 'references' has duplicates",
                        dataset_name,
                    )

            # systemOfRecords - optional
            check_url_field(False, item, "systemOfRecords", dataset_name, errs, allow_redacted=True)

            # theme - optional
            if item.get("theme") is None or is_redacted(item.get("theme")):
                pass  # not required or REDACTED
            elif not isinstance(item["theme"], list):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'theme' must be an array.",
                    dataset_name,
                )
            else:
                for s in item["theme"]:
                    if not isinstance(s, (str, unicode)):
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "Each value in the theme array must be a string",
                            dataset_name,
                        )
                    elif len(s.strip()) == 0:
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "A value in the theme array was an empty string.",
                            dataset_name,
                        )

    # Form the output data.
    for err_type in sorted(errs):
        errors_array.append(
            (
                err_type[1],  # heading
                [
                    err_item
                    + (" (%d locations)" % len(errs[err_type][err_item]) if len(errs[err_type][err_item]) else "")
                    for err_item in sorted(errs[err_type], key=lambda x: (-len(errs[err_type][x]), x))
                ],
            )
        )
def is_valid_url(a_string):
    """Check if a string is a valid URL."""
    return rfc3987.match(a_string, 'URI') is not None
def run(repo):
    log.info("Request to: " + request.base_url)
    log.info("workspace:" + workspace)
    repopath = posixpath.join(workspace, repo)
    log.info("repopath:" + repopath)
    log.info("Running " + repo + " repository in " + workspace)
    log.info(repopath)

    if not status(repopath):
        log.debug("Repository " + repo + " doesn't exist in " + repopath)
        abort(404)
    else:
        log.info("Repository size: " + size(repopath))

    graph = request.args.get('graph', '')
    branch = request.args.get('branch', '')
    checkoutBranch(branch, repopath)

    if graph == "":
        filelist = lsfiles(repopath)
        return render_template('list.html', data=pathsToURIs(repo, filelist))
    elif not match(graph, rule='absolute_IRI'):
        log.debug("Graph URI is not valid")
        abort(406)

    if notSupportedContentType(request):
        abort(412)

    fileGraph = FileGraph(graph, domain, repopath, repo)

    if request.method == 'GET':
        if fileGraph.doExists():
            return send_file(fileGraph.filepath, 'text/turtle')
        else:
            abort(404)
    elif request.method == 'POST':
        # TODO: Abort on history branches. Create list of actual branches?
        log.debug("POST to " + fileGraph.iri)
        if fileGraph.doExists():
            fileGraph.parsePath()
        fileGraph.parseString(request.data.decode('utf-8'))
        fileGraph.serialize()
        autoAddAndCommit(fileGraph, "POST - Adding graph " + fileGraph.iri)
        return loglast(repopath)
    elif request.method == 'PUT':
        # TODO: Abort on history branches
        log.debug("PUT to " + fileGraph.iri)
        fileGraph.parseString(request.data.decode('utf-8'))
        fileGraph.serialize()
        autoAddAndCommit(fileGraph, "PUT - Writing graph " + fileGraph.iri)
        return loglast(repopath)
    elif request.method == 'DELETE':
        # TODO: Abort on history branches
        log.debug("DELETE to " + fileGraph.iri)
        if fileGraph.doExists():
            deleteFile(fileGraph.filepath)
            autoAddAndCommit(fileGraph, "DELETE - Removing graph " + fileGraph.iri)
            return ('', 204)
        else:
            log.debug("GRAPH " + fileGraph.iri + " DOES NOT EXIST in " + fileGraph.filepath)
            abort(404)
    else:
        abort(406)
def __init__(self, ref_url, stop_words=None, late_kills=None):
    """
    Arguments:
    ----------
    ref_url : str
        Web page from which search terms are to be extracted
    stop_words : sequence or set
        List or set with common words to be excluded from search string.
        `stop_words` will be applied *before* any multiple word phrases
        are constructed.
    late_kills : sequence or set
        Like `stop_words` but the words in `late_kills` will only be
        eliminated from the search string *after* multiple word phrases
        have been constructed. Thus you can have a word like 'report'
        appear in the search string as part of a multiple word phrase
        ("OECD Report on Public Health") but not as a single word (which
        would have almost zero selectivity for a news article).
    """
    if not rfc3987.match(ref_url, rule='URI_reference'):
        raise InvalidUrlError(ref_url)

    # check if url points to non-html:
    p_url = urlparse(ref_url)
    if p_url[2].endswith(tuple(WebArticle.exclude_formats)):
        raise ArticleFormatError(p_url[2])

    # Default to empty frozensets (not lists) so that union() below works.
    self._stop_words = frozenset(stop_words) if stop_words else frozenset()
    self._late_kills = frozenset(late_kills) if late_kills else frozenset()
    self.url = ref_url

    try:
        result = requests.get(ref_url,
                              headers={'User-Agent': random.choice(REF.user_agents)},
                              proxies=_get_proxies())
    except requests.exceptions.RequestException:
        raise PageRetrievalError(ref_url)
    if not result.status_code == requests.codes.ok:
        raise PageRetrievalError(ref_url)
    ht = result.text
    encoding = result.encoding if result.encoding else 'utf-8'
    ht = ht.encode(result.encoding) if isinstance(ht, unicode) else ht

    # need to parse only to check for excessive number of headings
    parsed = html.fromstring(ht)
    if max([len(parsed.xpath('//h{0}'.format(i + 1)))
            for i in xrange(4)]) > WebArticle.max_headings:
        logging.debug("too many headings in %s, raising exception", ref_url)
        raise NotAnArticleError(ref_url)

    # now get the article content
    g = Goose()
    try:
        article = g.extract(raw_html=ht)
    except ValueError:
        logging.debug("could not extract article from %s" % ref_url)
        raise ArticleExtractionError(ref_url)
    self.title = article.title
    self.text = article.cleaned_text
    if not self.text:
        logging.debug("could not extract article from %s" % ref_url)
        raise ArticleExtractionError(ref_url)

    self.wlist = build_wlist(self.text)
    self.wcount = len(self.wlist)
    logging.debug("built %d word list for article \"%s\"" % (self.wcount, self.title))
    sl = [WebArticle.stemmer.stem(w) for w in self.wlist
          if w not in self._stop_words.union(self._late_kills) and len(w) > 2]
    self.stem_tops = Counter(sl).most_common()
def analyze(self, graph):
    """Analysis of SKOS concepts and related properties presence in a dataset."""
    log = logging.getLogger(__name__)
    concept_count = dict()
    schemes_count = dict()
    top_concept = dict()

    concepts = [row['concept'] for row in graph.query("""
        SELECT DISTINCT ?concept WHERE {
            ?concept a <http://www.w3.org/2004/02/skos/core#Concept>.
        }
    """)]

    for c in concepts:
        if not rfc3987.match(c):
            log.debug(f'{c} is not a valid IRI')
            continue
        for row in graph.query(SkosAnalyzer._count_query(c)):
            concept_count[c] = row['count']

    schemes = [row['scheme'] for row in graph.query("""
        SELECT DISTINCT ?scheme WHERE {
            OPTIONAL {?scheme a <http://www.w3.org/2004/02/skos/core#ConceptScheme>.}
            OPTIONAL {?_ <http://www.w3.org/2004/02/skos/core#inScheme> ?scheme.}
        }
    """)]

    for schema in schemes:
        if not rfc3987.match(schema):
            log.debug(f'{schema} is not a valid IRI')
            continue
        for row in graph.query(SkosAnalyzer._scheme_count_query(str(schema))):
            schemes_count[schema] = row['count']

    for schema in schemes:
        if not rfc3987.match(schema):
            continue
        top_concept[schema] = [
            row['concept'] for row in graph.query(
                SkosAnalyzer._scheme_top_concept(str(schema)))
        ]

    collections = [row['coll'] for row in graph.query("""
        SELECT DISTINCT ?coll WHERE {
            OPTIONAL { ?coll a <http://www.w3.org/2004/02/skos/core#Collection>. }
            OPTIONAL { ?coll a <http://www.w3.org/2004/02/skos/core#OrderedCollection>. }
            OPTIONAL { ?a <http://www.w3.org/2004/02/skos/core#member> ?coll. }
            OPTIONAL { ?coll <http://www.w3.org/2004/02/skos/core#memberList> ?b. }
        }
    """)]

    ord_collections = [row['coll'] for row in graph.query("""
        SELECT DISTINCT ?coll WHERE {
            ?coll a <http://www.w3.org/2004/02/skos/core#OrderedCollection>.
        }
    """)]

    return {
        'concept': concept_count,
        'schema': schemes_count,
        'topConcepts': top_concept,
        'collection': collections,
        'orderedCollection': ord_collections
    }
# -*- coding: utf-8 -*-

import sys
import os
import time
import re
import tempfile
import urllib
# import urllib.request
import rfc3987

dir = tempfile.gettempdir() + "/lview"
# os.makedirs(dir, exist_ok=True)
if not os.path.exists(dir):
    os.makedirs(dir)

for arg in sys.argv[1:]:
    if rfc3987.match(arg, "URI"):
        h = rfc3987.parse(arg, "URI")
        filename = os.path.basename(h["path"])
        ext = os.path.splitext(filename)[1]
        if ext.lower() in (".bmp", ".jpg", ".jpeg", ".png", ".gif"):
            response = urllib.urlopen(arg)
            # response = urllib.request.urlopen(arg)
            filename = str(int(time.time())) + "_" + os.path.basename(h["path"])
            fpw = open(dir + "/" + filename, "wb")
            fpw.write(response.read())
            fpw.close()
def _dcat_extractor(g, red, log):
    distributions, distributions_priority = [], []
    endpoints = []
    dcat = Namespace('http://www.w3.org/ns/dcat#')
    dcterms = Namespace('http://purl.org/dc/terms/')
    nkod = Namespace('https://data.gov.cz/slovník/nkod/mediaTyp')
    # IANA
    media_priority = set([
        'https://www.iana.org/assignments/media-types/application/rdf+xml',
        'https://www.iana.org/assignments/media-types/application/trig',
        'https://www.iana.org/assignments/media-types/text/n3',
        'https://www.iana.org/assignments/media-types/application/ld+json',
        'https://www.iana.org/assignments/media-types/application/n-triples',
        'https://www.iana.org/assignments/media-types/application/n-quads',
        'https://www.iana.org/assignments/media-types/text/turtle'
    ])
    # EU
    format_priority = set([
        'http://publications.europa.eu/resource/authority/file-type/RDF',
        'http://publications.europa.eu/resource/authority/file-type/RDFA',
        'http://publications.europa.eu/resource/authority/file-type/RDF_N_QUADS',
        'http://publications.europa.eu/resource/authority/file-type/RDF_N_TRIPLES',
        'http://publications.europa.eu/resource/authority/file-type/RDF_TRIG',
        'http://publications.europa.eu/resource/authority/file-type/RDF_TURTLE',
        'http://publications.europa.eu/resource/authority/file-type/RDF_XML',
        'http://publications.europa.eu/resource/authority/file-type/JSON_LD',
        'http://publications.europa.eu/resource/authority/file-type/N3'
    ])
    queue = distributions

    log.info("Extracting distributions")
    # DCAT dataset
    dsdistr, distrds = ds_distr()
    with red.pipeline() as pipe:
        for ds in g.subjects(RDF.type, dcat.Dataset):
            # dataset titles (possibly multilang)
            for t in g.objects(ds, dcterms.title):
                key = ds_title(ds, t.language)
                red.set(key, t.value)

            # DCAT Distribution
            for d in g.objects(ds, dcat.distribution):
                # put RDF distributions into a priority queue
                for media in g.objects(d, dcat.mediaType):
                    if str(media) in media_priority:
                        queue = distributions_priority

                for format in g.objects(d, dcterms.format):
                    if str(format) in format_priority:
                        queue = distributions_priority

                # data.gov.cz specific
                for format in g.objects(d, nkod.mediaType):
                    if 'rdf' in str(format):
                        queue = distributions_priority

                # download URL to files
                for downloadURL in g.objects(d, dcat.downloadURL):
                    if rfc3987.match(str(downloadURL)):
                        log.debug(f'Distribution {downloadURL!s} from DCAT dataset {ds!s}')
                        queue.append(downloadURL)
                        pipe.hset(dsdistr, str(ds), str(downloadURL))
                        pipe.hset(distrds, str(downloadURL), str(ds))
                    else:
                        # note: the original logged the undefined name `access` here
                        log.warn(f'{downloadURL!s} is not a valid download URL')

                # scan for DCAT2 data services here as well
                for access in g.objects(d, dcat.accessURL):
                    for endpoint in g.objects(access, dcat.endpointURL):
                        if rfc3987.match(str(endpoint)):
                            log.debug(f'Endpoint {endpoint!s} from DCAT dataset {ds!s}')
                            endpoints.append(endpoint)
                            pipe.hset(dsdistr, str(ds), str(endpoint))
                            pipe.hset(distrds, str(endpoint), str(ds))
                        else:
                            log.warn(f'{endpoint!s} is not a valid endpoint URL')

        pipe.sadd('purgeable', dsdistr, distrds)
        # TODO: expire
        pipe.execute()
    # TODO: possibly scan for service description as well

    tasks = [process_priority.si(a) for a in distributions_priority]
    tasks.extend(process_endpoint.si(e) for e in endpoints)
    tasks.extend(process.si(a) for a in distributions)
    return group(tasks).apply_async()
def _validate_type_url(self, value):
    """ Enables validation for 'url' type values """
    if match(value, rule='URI'):
        return True
def __init__(self, url):
    if rfc3987.match(url, 'URI') is None or not re.match("^http(s)?://.*", url):
        raise ValueError('Invalid URL')
    else:
        self.url = url
        self.__getStaStatus(self.__getMessageFromStaRss())
def isValidURL(untestedStr):
    return match(untestedStr, rule="IRI") is not None
def is_type(cls, other):
    if type(other) is str:
        return rfc3987.match(other, rule='URI')
    else:
        return isinstance(other, HTMLURI)