Exemplo n.º 1
0
def api_analyze_catalog():
    """Queue an analysis task for a SPARQL endpoint / named-graph pair.

    Expects 'sparql' and 'graph' query parameters, both valid per rfc3987;
    responds 400 otherwise.
    """
    # Each browser session gets a stable batch token.
    if 'token' not in session:
        session['token'] = str(uuid.uuid4())

    iri = request.args.get('sparql', None)
    graph = request.args.get('graph', None)
    args_ok = (iri is not None and graph is not None
               and rfc3987.match(iri) and rfc3987.match(graph))
    if not args_ok:
        abort(400)

    current_app.logger.info(
        f'Analyzing endpoint {iri}, named graph {graph}')
    red = redis.Redis(connection_pool=redis_pool)

    # Throttling: block while this session's batch queue is too long.
    key = f'batch:{session["token"]}'
    queue_length = red.scard(key)
    while queue_length > 1000:
        current_app.logger.warning(f'Queue length: {queue_length}')
        time.sleep(60)
        queue_length = red.scard(key)

    task = inspect_graph.si(iri, graph).apply_async()
    current_app.logger.info(
        f'Batch id: {session["token"]}, task id: {task.id}')
    red.hset('taskBatchId', task.id, session["token"])
    return ''
def get_generate_uri(source: dict, entity: str, data: dict) -> str:
    """
    Get/Generate URI from BrAPI object or generate one
    """
    original = data.get(entity + 'PUI')

    # A PUI that is already a well-formed URI is used verbatim.
    if original and rfc3987.match(original, rule='URI'):
        return original

    source_id = urllib.parse.quote(source['schema:identifier'])
    data_id = get_identifier(entity, data)
    if original:
        # Generate URI by prepending the original URI with the source identifier
        data_uri = f"urn:{source_id}/{urllib.parse.quote(original)}"
    else:
        # Generate URI from source id, entity name and data id
        data_uri = (f"urn:{source_id}/{urllib.parse.quote(entity)}"
                    f"/{urllib.parse.quote(data_id)}")

    if not rfc3987.match(data_uri, rule='URI'):
        raise Exception(
            f'Could not get or create a correct URI for "{entity}" object id "{data_id}"'
            f' (malformed URI: "{data_uri}")')
    return data_uri
Exemplo n.º 3
0
 def validAllowList(self, allowlist):
     """Return True iff every origin is a known keyword or a valid URI.

     Raises ValueError when *allowlist* is not of the expected type.
     """
     if not self.is_type(allowlist):
         raise ValueError()
     for origin in allowlist:
         # BUG FIX: the original `not origin in X or rfc3987.match(...)`
         # parsed as `(not in X) or match(...)`, so every syntactically
         # valid URI was rejected.  An origin is acceptable when it is
         # either a policy keyword OR a valid URI.
         if not (origin in HTMLFeaturePolicy.valid_origins
                 or rfc3987.match(origin, rule='URI')):
             return False
     return True
Exemplo n.º 4
0
def is_valid_url(a_string):
    """Check if a string is a valid URL."""
    # rfc3987.match returns a match object or None; normalise to bool.
    return rfc3987.match(a_string, 'URI') is not None
Exemplo n.º 5
0
def ExtractTextFeatures():
    """Derive a human-readable text label for every graph node URI.

    Reads "<uri>\t\t<id>" lines from Graph_Origin_Node_ID.txt and writes
    "<id>\t\t<text>" lines to Graph_ID_Text.txt.
    """
    uri2label = GetEList()
    # Context managers guarantee both files are closed even on error
    # (the original left them open if an exception escaped the loop).
    with open("../dataset/Graph_ID_Text.txt", "w", encoding="utf-8") as id2text, \
         open("../dataset/Graph_Origin_Node_ID.txt", "r", encoding="utf-8") as NodeID:
        count = 0
        for line in NodeID:
            uri, id = line.strip().split("\t\t")
            text = ""
            if match(uri, rule='IRI_reference') is None:
                # Not an IRI at all: the raw value is its own label.
                text = uri
            elif uri in uri2label:
                text = uri2label[uri]
            elif len(uri.split("#")) > 1:
                # Split the CamelCase fragment into words.
                text = re.sub(r"(\w)([A-Z])", r"\1 \2", (uri.split("#"))[-1])
            else:
                g = Graph()
                try:
                    g.parse(uri)
                    text = g.label(URIRef(uri))
                except Exception:
                    # Narrowed from a bare except: so KeyboardInterrupt
                    # and SystemExit are no longer swallowed.
                    print("不能解析!")
                if text == "":
                    uri_s = uri.split("resource/")
                    if len(uri_s) > 1:
                        text = uri_s[-1]
                    else:
                        text = (uri.split("/"))[-1]
            id2text.write(id + "\t\t" + text + "\n")
            print(count)
            count += 1
Exemplo n.º 6
0
def ExtractTextFeatures():
    """Resume label extraction for F-graph nodes 36661..80999.

    Appends "<id>\t\t<text>" lines to FGraph_ID_Text.txt for the lines of
    FGraph_Origin_Node_ID.txt within the batch window.
    """
    # Context managers guarantee both files are closed even on error
    # (the original left them open if an exception escaped the loop).
    with open("../dataset/FGraph_ID_Text.txt", "a", encoding="utf-8") as id2text, \
         open("../dataset/FGraph_Origin_Node_ID.txt", "r", encoding="utf-8") as NodeID:
        count = 0
        for line in NodeID:
            # Skip already-processed lines; stop at the batch upper bound.
            if count < 36661:
                count += 1
                continue
            if count >= 81000:
                break
            uri, id = line.strip().split("\t\t")
            text = ""
            if match(uri, rule='IRI_reference') is None:
                text = uri
            elif len(uri.split("#")) > 1:
                # Split the CamelCase fragment into words.
                text = re.sub(r"(\w)([A-Z])", r"\1 \2", (uri.split("#"))[-1])
            else:
                g = Graph()
                try:
                    g.parse(uri)
                    text = g.label(URIRef(uri))
                except Exception:
                    # Narrowed from a bare except: so KeyboardInterrupt
                    # and SystemExit are no longer swallowed.
                    print("不能解析!")
                if text == "":
                    uri_s1 = uri.split("resource/")
                    if len(uri_s1) > 1:
                        text = uri_s1[-1]
                    else:
                        uri_s2 = (uri.split("/"))[-1]
                        if uri_s2 != "":
                            text = uri_s2
                        else:
                            text = uri
            id2text.write(id + "\t\t" + text + "\n")
            print(count)
            count += 1
Exemplo n.º 7
0
Arquivo: wburl.py Projeto: rajbot/pywb
    def __init__(self, url):
        """Parse a wayback request *url* into url/timestamp/mod components.

        Raises wbexceptions.RequestParseException when no parser accepts the
        request, and wbexceptions.BadUrlException when the embedded url is
        not a syntactically valid IRI.
        """
        self.original_url = url
        self.type = None
        self.url = ''
        self.timestamp = ''
        self.mod = ''

        # Try each parser in turn; whichever one claims the url fills in
        # self.url / self.timestamp / self.mod.
        if not any (f(url) for f in [self._init_query, self._init_replay]):
            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)

        if len(self.url) == 0:
            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)

        # protocol agnostic url -> http://
        #if self.url.startswith('//'):
        #    self.url = self.DEFAULT_SCHEME + self.url[2:]
        # no protocol -> http://
        if not '://' in self.url:
            self.url = self.DEFAULT_SCHEME + self.url

        # BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
        # %2F is fine, but %2f -- standard supports either
        matcher = rfc3987.match(self.url.upper(), 'IRI')

        if not matcher:
            raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
Exemplo n.º 8
0
def ask_source():
    """Prompt until the user enters a syntactically valid IRI; return it."""
    while True:
        source = input("Enter the template git repository: ")
        if match(source, rule='IRI'):
            return source
        print(f"Please try another git repository than {source}.")
    # The original had an unreachable `return ""` after this infinite
    # loop; it has been removed.
Exemplo n.º 9
0
 def _is_valid(self, instance) -> bool:
     """Return True when *instance* can be treated as a FullIRI."""
     # Something is considered to be an instance of FullIRI iff:
     #   It isn't an RDFLib Literal -- you can never cross those beams
     #   It is already declared to be a URIRef no matter what it looks like
     #   It looks like an IRI
     return instance is not None and not isinstance(instance, (rdflib.Literal, Literal)) \
            and (isinstance(instance, URIRef) or rfc3987.match(str(instance), 'IRI'))
Exemplo n.º 10
0
    def __init__(self, label, src, wf_name):
        """Register a workflow input and materialise its host/container paths.

        :param label: name used to reference this input elsewhere in the wf file
        :param src: URI, absolute host path, or path relative to eod's CWD
        :param wf_name: workflow name, passed to set_abs_host_path
        :raises Error: when no host path can be computed for *src*
        """
        # a label to reference this input by in other parts of the wf file.
        self.label = label

        # src can either be:
        # 1) URI
        # 2) absolute path on the host
        # 3) relative path on the host, relative to the CWD of eod
        # In the FUTURE: should also allow
        # 4) a global output of another eod.yml, in which case it should be <wf_name>.<outputs>.<label>
        # (this is not yet supported)

        self.src = src

        # Remember whether src is a URI so it can be written out below.
        self.is_uri = False
        if rfc3987.match(self.src, 'URI'):
            self.is_uri = True
            self.uri = self.src

        self.set_abs_host_path(wf_name)
        if not self.abs_host_path:
            raise Error("Could not compute host path for src:{}, label:{}".format(src, label))
        self.eod_container_path = to_eod(self.abs_host_path)
        base_dir = os.path.dirname(self.eod_container_path)
        # make sure the parent directory exists,
        if not os.path.exists(base_dir):
            print("Creating global input base_dir:{}".format(base_dir))
            os.makedirs(base_dir)
        # create a file with the URI
        if self.is_uri:
            with open(self.eod_container_path, 'w') as f:
                print(self.uri, file=f)
Exemplo n.º 11
0
def assemble_command(arguments, statistics, nicknames):
    """Build and (optionally) run a streamlink+VLC command for a streamer.

    Returns 1 when the streamer nickname is unknown, 0 on a dry run, and
    otherwise the exit code of the spawned streamlink process.
    """
    streamer = arguments.streamer

    # Absolute URLs are used directly and counted in usage statistics;
    # anything else is resolved through the saved nicknames.
    if match(streamer, 'absolute_URI'):
        url = streamer
        statistics.add_usage(url)
    elif nicknames.find(streamer):
        url = nicknames.get(streamer)
    else:
        print("Nickname", streamer, "has not been defined yet", file=stderr)
        return 1

    statistics.save()
    player_command = ''
    # video mode: best quality with aggressive caching; twitch: audio-only
    # stream; everything else: lowest quality, video disabled in VLC.
    if arguments.mode == 'video':
        quality = 'best'
        player_command = '--file-caching=10000'
    elif url.find('twitch') >= 0:
        quality = 'audio'
    else:
        quality = 'worst'
        player_command = '--novideo'

    exec_string = "streamlink " + "--player \"'C:\\Program Files\\VideoLAN\\VLC\\vlc.exe' " + player_command + "\"" \
                  + ' ' + url + ' ' + quality

    if arguments.dry_run:
        print("The resulting command string:")
        print("[", exec_string, "]")
        return 0
    else:
        print('The real execution starts here')
        # NOTE(review): passing a whole command string with shell=False only
        # works on Windows (CreateProcess takes the string verbatim); on
        # POSIX this would fail — confirm the target OS.
        return call(exec_string, shell=False)
Exemplo n.º 12
0
    def POST(self):
        """Handle the form submission: normalise the URL and redirect.

        Redirects to /request with the cleaned URL and checkspec flag, or
        to /error when the submitted value is not a valid URI reference.
        """

        if not urlForm.validates():
            return render.index(urlForm)
        form_data = web.input()

        # Presence of the 'checkspec' checkbox toggles spec checking.
        check_spec = True if 'checkspec' in form_data else False

        if rfc3987.match(form_data.URL, rule='URI_reference'):
            # Rebuild the URL with an explicit scheme (http by default) and
            # without duplicated leading slashes.
            parsed = urlparse.urlparse(form_data['URL'], scheme='http')
            ref_url = parsed.scheme + '://' + '/'.join([parsed.netloc,
                    parsed.path.lstrip('/')]).lstrip('/')
        else:
            ref_url = None

        if ref_url:
            params = {'url': ref_url,
                      'checkspec': int(check_spec)}
            raise web.seeother('/request?' + urlencode(params), '/')
        else:
            params = {
                    'msg': "Sorry, not implememted yet...",
                    'back_link': '/'
            }
            raise web.seeother('/error?' + urlencode(params))
Exemplo n.º 13
0
def iri(instance):
    """Check *instance* against the RFC 3987 IRI grammar.

    When the optional rfc3987 package is absent the check is skipped and
    everything passes (with a warning).
    """
    try:
        import rfc3987
    except ImportError:
        rfc3987 = None
    if rfc3987 is None:
        logging.warning(
            "package rfc3987 missing - cannot validate iri - so it passes")
        return True
    return rfc3987.match(instance, rule="IRI")
Exemplo n.º 14
0
    def validate_iri(value):
        """Return True for None or any string matching the IRI grammar."""
        # None is treated as "nothing to validate", hence valid.
        return value is None or rfc3987.match(value, rule='IRI') is not None
Exemplo n.º 15
0
def uri_reference(instance):
    """Check *instance* against the RFC 3986 URI-reference grammar.

    When the optional rfc3987 package is absent the check is skipped and
    everything passes (with a warning).
    """
    try:
        import rfc3987
    except ImportError:
        rfc3987 = None
    if rfc3987 is None:
        logging.warning(
            "package rfc3987 missing - cannot validate uri-reference - so it passes"
        )
        return True
    return rfc3987.match(instance, rule="URI_reference")
def uri_validator(node, uri):
    """
    URL validator rfc3987 is used to check if a URL is correct (https://pypi.python.org/pypi/rfc3987/).

    :param node: The schema node to which this exception relates.
    :param uri: Uri to validate.
    :raises colander.Invalid: when *uri* does not satisfy the URI rule.
    """
    if rfc3987.match(uri, rule='URI'):
        return
    raise colander.Invalid(node, '{0} is geen geldige URI.'.format(uri))
Exemplo n.º 17
0
 def setupsubject(self, sub):
     """Validate *sub* as an RFC 3987 URI and store it as the triple subject.

     Logs an error (instead of propagating) when *sub* is not a valid URI.
     """
     logger.info('INFO add_triple.py - setup subject %s', sub)
     try:
         # Compare the match result to None directly instead of the
         # original round-trip through str(...) != 'None'.
         if rfc3987.match(sub, rule='URI') is not None:
             self.sub = sub
         else:
             raise Exception("This is not a valid URI")
     except Exception as e:
         # str(e) works on Python 2 and 3; e.message was Python 2 only
         # and raised AttributeError on Python 3.
         logger.error("ERROR! add_triple.py - " + str(e))
Exemplo n.º 18
0
def uri_validator(node, uri):
    """
    URL validator rfc3987 is used to check if a URL is correct (https://pypi.python.org/pypi/rfc3987/).

    :param node: The schema node to which this exception relates.
    :param uri: Uri to validate.
    :raises colander.Invalid: when *uri* does not satisfy the URI rule.
    """
    is_valid = rfc3987.match(uri, rule='URI') is not None
    if not is_valid:
        raise colander.Invalid(node, '{0} is geen geldige URI.'.format(uri))
Exemplo n.º 19
0
def is_uri(uri):
    '''
    Check if a string is a valid URI according to rfc3987

    :param string uri:
    :rtype: boolean
    '''
    if uri is None:
        return False
    # Coerce the match object (or None) to the boolean the docstring
    # promises; the original leaked the rfc3987 match object to callers.
    return rfc3987.match(uri, rule='URI') is not None
Exemplo n.º 20
0
def is_uri(uri):
    '''
    Check if a string is a valid URI according to rfc3987

    :param string uri:
    :rtype: boolean
    '''
    # Normalise to the documented boolean type: the original returned the
    # rfc3987 match object itself (truthy, but not a bool) on success.
    return uri is not None and rfc3987.match(uri, rule='URI') is not None
Exemplo n.º 21
0
    def adduri(self, p, o):
        """Add the triple (self.sub, p, o) where *p* and *o* must be URIs.

        Both predicate and object are validated against the RFC 3987 URI
        grammar before being turned into repository URI objects; failures
        are logged rather than propagated.
        """
        try:
            s = self.conn.createURI(self.sub)
            # Compare the match result to None directly instead of the
            # original round-trip through str(...) != 'None'.
            if rfc3987.match(p, rule='URI') is not None:
                p = self.conn.createURI(p)
            else:
                raise Exception("This is not a valid URI")
            if rfc3987.match(o, rule='URI') is not None:
                o = self.conn.createURI(o)
            else:
                raise Exception("This is not a valid URI")
            logger.info("INFO add_triple.py - add uri <%s> %s %s", self.sub, p, o)
            # Only pass contexts when some were configured.
            if len(self.contexts) > 0:
                self.conn.add(s, p, o, contexts=self.contexts)
            else:
                self.conn.add(s, p, o)
        except Exception as e:
            # str(e) works on Python 2 and 3; e.message was Python 2 only.
            logger.error("ERROR! add_triple.py - " + str(e))
Exemplo n.º 22
0
def parse_streamer_url(url, nicknames):
    """Resolve *url* to a (name, stream_url) pair.

    Accepts either an absolute URL (name = last non-empty path segment)
    or a previously defined nickname; yields (None, None) otherwise.
    """
    if match(url, 'absolute_URI'):
        segments = [part for part in parse(url)['path'].split('/') if part]
        return segments[-1], url
    if nicknames.find(url):
        return url, nicknames.get(url)
    print("Nickname \"{0}\" has not been defined yet".format(url))
    return None, None
def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False):
    """Check that a field, when present, holds a valid RFC 3987 URL.

    Records an error in *errs* and returns False on failure; returns True
    when the field is acceptable (or optional and absent).
    """
    absent = field_name not in obj or obj[field_name] is None
    if not required and absent:
        # Optional and missing: nothing to validate.
        return True
    if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs):
        return False  # wrong data type, error already recorded
    if allow_redacted and is_redacted(obj[field_name]):
        return True
    if rfc3987_url.match(obj[field_name]):
        return True
    add_error(errs, 5, "Invalid Required Field Value",
              "The '%s' field has an invalid rfc3987 URL: \"%s\"." % (field_name, obj[field_name]), dataset_name)
    return False
Exemplo n.º 24
0
def link_title_parse_hook(bot, channel, sender, message):
    """Announce the HTML <title> of every URI mentioned in *message*."""
    for token in message.split(" "):
        # Only whitespace-separated tokens that parse as URIs are fetched.
        if not rfc3987.match(token, rule='URI'):
            continue
        try:
            response = requests.get(token)
            soup = BeautifulSoup(response.text, 'html.parser')
            page_title = soup.head.title.text.strip().replace("\n", "")
            bot.message(channel, " :: {}".format(page_title))
        except requests.exceptions.InvalidSchema:
            # Valid URI but not an http(s) URL requests can fetch.
            pass
Exemplo n.º 25
0
def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False):
    """Validate an optional/required field as an RFC 3987 URL.

    Errors are appended to *errs*; the return value says whether the
    field passed validation.
    """
    if not required:
        if field_name not in obj or obj[field_name] is None:
            return True  # not required, so OK
    if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs):
        # Wrong data type; check_required_field already recorded the error.
        return False
    value = obj[field_name]
    if allow_redacted and is_redacted(value):
        return True
    if not rfc3987_url.match(value):
        add_error(errs, 5, "Invalid Required Field Value",
                  "The '%s' field has an invalid rfc3987 URL: \"%s\"." % (field_name, value), dataset_name)
        return False
    return True
Exemplo n.º 26
0
def get_uri(source, entity_name, object):
    """Get URI from BrAPI object or generate one"""
    pui = object.get(entity_name + 'PUI')

    # A PUI that is already a valid URI is returned unchanged.
    if pui and rfc3987.match(pui, rule='URI'):
        return pui

    source_id = source['schema:identifier']
    object_id = get_identifier(entity_name, object)
    if pui:
        # Generate URI by prepending the original URI with the source identifier
        object_uri = 'urn:{}/{}'.format(source_id, urllib.parse.quote(pui, safe=''))
    else:
        object_uri = get_uri_by_id(source, entity_name, object_id)

    if not rfc3987.match(object_uri, rule='URI'):
        raise Exception('Could not get or create a correct URI for "{}" object id "{}" (malformed URI: "{}")'
                        .format(entity_name, object_id, object_uri))

    return object_uri
Exemplo n.º 27
0
 def _validate_stringformat(self, stringtype, field, value):
     """ {'allowed': ['url', 'json']} """
     # Cerberus custom rule: *value* must be well-formed JSON or a
     # syntactically valid URI depending on *stringtype*.  (The docstring
     # above is the rule schema cerberus reads — do not change it.)
     if stringtype == 'json':
         try:
             json.loads(value)
         except ValueError:
             self._error(field, STRINGFORMAT_JSON)
         else:
             return True
     elif stringtype == 'url':
         if not match(value, rule='URI'):
             self._error(field, STRINGFORMAT_URL)
         else:
             return True
Exemplo n.º 28
0
    def startAuthenticationProcess(self):
        """Validate the account settings and attempt an OwnCloud login.

        Shows error dialogs for missing fields, invalid server URLs and
        failed logins; persists settings and refreshes the UI on success.
        """
        if self.settingsDialog.group_account.input_url.text and self.settingsDialog.group_account.input_username.text and self.settingsDialog.group_account.input_password.text:
            self.saveSettings()
            self.loadSettings()
            # Only attempt a network round-trip for syntactically valid URIs.
            if match(self.url, "URI"):
                try:
                    request = requests.get(self.url, timeout=3);

                    if request.status_code == 200:
                        oc = owncloud.Client(self.url)
                        oc.login(self.username, self.password)
                        self.connectStatus = "true"
                        self.saveSettings()
                        self.updateUi()
                except requests.exceptions.RequestException as e:
                    # Network-level failure: clear the stored connect status.
                    QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error", "The specified Server URL is invalid!")
                    settings = QSettings()
                    settings.remove("connect-status")
                    self.saveSettings()
                    self.updateUi()
                except Exception as e:
                    # NOTE(review): e.message is Python 2 only — confirm runtime.
                    errorMessage = self.formatConnectionError(e.message)

                    if errorMessage == "401":
                        self.settingsDialog.group_connection.widget_status.label_status.setText("Invalid")
                    else:
                        QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error", errorMessage)
            else:
                QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error", "The specified Server URL is invalid!")
        else:
            # Build a human-readable list of the fields still left empty.
            missingFields = ""
            fieldText = "field"

            if not self.settingsDialog.group_account.input_url.text:
                missingFields = "\"Server URL\""

            if not self.settingsDialog.group_account.input_username.text:
                if missingFields == "":
                    missingFields = "\"Username\""
                else:
                    missingFields = missingFields + " and \"Username\""
                    fieldText = "fields"

            if not self.settingsDialog.group_account.input_password.text:
                if missingFields == "":
                    missingFields = "\"Password\""
                else:
                    missingFields = missingFields.replace(" and", ",") + " and \"Password\""
                    fieldText = "fields"

            QMessageBox.critical(self.settingsDialog, "OwnCloud Connection Error", "The " + missingFields + " " + fieldText + " must be filled in!")
Exemplo n.º 29
0
Arquivo: aurl.py Projeto: nlevitt/pywb
    def __init__(self, url):
        """Parse a wayback request *url*; populate type/url/timestamp/mod.

        Raises RequestParseException when no parser accepts the request or
        the embedded url is not a valid IRI reference.
        """
        self.original_url = url
        self.type = None
        self.url = None
        self.timestamp = None
        self.mod = None

        # _init_query / _init_replay each try to claim and parse the url.
        if not any(f(self, url) for f in [aurl._init_query, aurl._init_replay]):
            raise RequestParseException("Invalid WB Request Url: " + url)

        matcher = rfc3987.match(self.url, "IRI_reference")

        if not matcher:
            raise RequestParseException("Bad Request Url: " + self.url)
Exemplo n.º 30
0
def create_reference(md5, pos):
    """
    Adds a reference notation to an upload

    Returns the reference URL on success, an apology message for an
    invalid URL, and aborts with 404 when the upload does not exist.
    """
    url = request.args.get('url', '')
    if not rfc3987.match(url, rule='URI'):
        # BUG FIX: the original format string had no %s placeholder, so the
        # interpolation raised TypeError instead of returning the message.
        return "Sorry, that's not a valid URL: %s\n" % url
    u = Upload.objects.filter(md5=md5).first()
    if not u:
        abort(404)
    # Create the reference
    r = Reference(upload=u, ref_url=url, pos=pos)
    r.save()

    return url
Exemplo n.º 31
0
 def addliteral(self, p, o):
     """Add the triple (self.sub, p, literal(o)); *p* must be a valid URI.

     Failures are logged rather than propagated.
     """
     try:
         s = self.conn.createURI(self.sub)
         # Compare the match result to None directly instead of the
         # original round-trip through str(...) != 'None'.
         if rfc3987.match(p, rule='URI') is not None:
             p = self.conn.createURI(p)
         else:
             raise Exception("This is not a valid URI")
         o = self.conn.createLiteral(o)
         logger.debug("DEBUG add_triple.py - add literal <%s> %s '%s'", self.sub, p, o)
         # Only pass contexts when some were configured.
         if len(self.contexts) > 0:
             self.conn.add(s, p, o, contexts=self.contexts)
         else:
             self.conn.add(s, p, o)
     except Exception as e:
         # str(e) works on Python 2 and 3; e.message was Python 2 only.
         logger.error("ERROR! add_triple.py - " + str(e))
Exemplo n.º 32
0
def create_reference(md5, pos):
    """
    Adds a reference notation to an upload

    Returns the reference URL on success, an apology message for an
    invalid URL, and aborts with 404 when the upload does not exist.
    """
    url = request.args.get('url', '')
    if not rfc3987.match(url, rule='URI'):
        # BUG FIX: the original format string had no %s placeholder, so the
        # interpolation raised TypeError instead of returning the message.
        return "Sorry, that's not a valid URL: %s\n" % url
    u = Upload.objects.filter(md5=md5).first()
    if not u:
        abort(404)
    # Create the reference
    r = Reference(upload=u, ref_url=url, pos=pos)
    r.save()

    return url
Exemplo n.º 33
0
def download_image(url):
    """Fetch *url* and return (ImageFile, image_type), or (None, None)."""
    # Skip empty strings and strings that are not absolute urls.
    if not url or not rfc3987.match(url, 'absolute_URI'):
        return None, None
    try:
        response = requests.get(url)
    except requests.exceptions.ConnectionError:
        return None, None
    if response.status_code != 200:
        return None, None
    payload = BytesIO(response.content)
    # Check that the file is really an image.
    detected_type = imghdr.what(payload)
    if detected_type:
        return ImageFile(payload), detected_type
    return None, None
Exemplo n.º 34
0
def link_title_parse_hook(bot, channel, sender, message):
    """Post page titles for URIs in *message*, if processing is allowed."""
    if not allowed_to_process(bot, channel):
        return

    for token in message.split(" "):
        # Only whitespace-separated tokens that parse as URIs are fetched.
        if not rfc3987.match(token, rule='URI'):
            continue
        try:
            response = requests.get(token)
        except requests.exceptions.InvalidSchema:
            # Valid URI but not an http(s) URL requests can fetch.
            continue
        # Any non-OK response stops processing of the whole message.
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        page_title = soup.head.title.text.strip().replace("\n", "")
        bot.message(channel, " :: {}".format(page_title))
Exemplo n.º 35
0
    def startAuthenticationProcess(self):
        """Validate the account settings and attempt a NextCloud login.

        Shows error dialogs for missing fields, invalid server URLs and
        failed logins; persists settings and refreshes the UI on success.
        """
        if self.settingsDialog.group_account.input_url.text and self.settingsDialog.group_account.input_username.text and self.settingsDialog.group_account.input_password.text:
            self.saveSettings()
            self.loadSettings()
            # Only attempt a network round-trip for syntactically valid URIs.
            if match(self.url, "URI"):
                try:
                    request = requests.get(self.url, timeout=3)

                    if request.status_code == 200:
                        oc = nextcloud.Client(self.url)
                        oc.login(self.username, self.password)
                        self.connectStatus = "true"
                        self.saveSettings()
                        self.updateUi()
                except requests.exceptions.RequestException as e:
                    # Network-level failure: clear the stored connect status.
                    QMessageBox.critical(
                        self.settingsDialog, "NextCloud Connection Error",
                        "The specified Server URL is invalid!")
                    settings = QSettings()
                    settings.remove("connect-status")
                    self.saveSettings()
                    self.updateUi()
                except Exception as e:
                    # NOTE(review): e.message is Python 2 only — confirm runtime.
                    errorMessage = self.formatConnectionError(e.message)

                    if errorMessage == "401":
                        self.settingsDialog.group_connection.widget_status.label_status.setText(
                            "Invalid")
                    else:
                        QMessageBox.critical(self.settingsDialog,
                                             "NextCloud Connection Error",
                                             errorMessage)
            else:
                QMessageBox.critical(self.settingsDialog,
                                     "NextCloud Connection Error",
                                     "The specified Server URL is invalid!")
        else:
            # Build a human-readable list of the fields still left empty.
            missingFields = ""
            fieldText = "field"

            if not self.settingsDialog.group_account.input_url.text:
                missingFields = "\"Server URL\""

            if not self.settingsDialog.group_account.input_username.text:
                if missingFields == "":
                    missingFields = "\"Username\""
                else:
                    missingFields = missingFields + " and \"Username\""
                    fieldText = "fields"

            if not self.settingsDialog.group_account.input_password.text:
                if missingFields == "":
                    missingFields = "\"Password\""
                else:
                    missingFields = missingFields.replace(
                        " and", ",") + " and \"Password\""
                    fieldText = "fields"

            QMessageBox.critical(
                self.settingsDialog, "NextCloud Connection Error", "The " +
                missingFields + " " + fieldText + " must be filled in!")
Exemplo n.º 36
0
    def triple_value(value):
        """ This function takes as input the predicate's value and returns it in the right format, be it an
            integer, decimal, double, boolean, date, time or dateTime datatype or whether it is a URI.

            Empty or None input yields "". The branch order matters: the URI
            check runs first, then typed literals, then quoted strings. """

        # Check whether the value is null or empty
        if value is None:
            return ""
        else:
            value = value.strip()

        # Return an empty string if the value is an empty string
        if value == "":
            return ""

        # A lone backslash would escape the closing quote; double it.
        if value == "\\":
            value += "\\"

        # Replace double quote with a single quote
        value = to_unicode(value)
        value = value.replace('"', "'")

        # URI values
        if ("http://" in value or "https://" in value) and " " not in value:
            if match(value) is not None:
                return to_unicode(u"<{0}>".format(value))
            elif re.search("[“”’`\r\n'\"]+", value, re.IGNORECASE):
                # Quote-like characters force long (triple-quoted) literals.
                return to_unicode(u"\"\"\"{0}\"\"\"".format(value))
            else:
                return to_unicode(u"\"{0}\"".format(value))  # ^^xsd:string

        # NUMBERS: can be written like other literals with lexical form and datatype
        elif RDF.rgxInteger.match(value):
            return u"\"{0}\"^^xsd:integer".format(value)

        elif RDF.rgxDecimal.match(value):
            return u"\"{0}\"^^xsd:decimal".format(value)

        elif RDF.rgxDouble.match(value):
            return u"\"{0}\"^^xsd:double".format(value)

        # BOOLEAN: values may be written as either 'true' or 'false' (case-sensitive)
        # and represent RDF literals with the datatype xsd:boolean. """
        elif value == "true" or value == "false":
            return u"\"{0}\"^^xsd:boolean".format(value)

        # DATE: specified in the following form "YYYY-MM-DD"
        # Note: All components are required!
        elif RDF.rgxDate.match(value):
            return u"\"{0}\"^^xsd:date".format(value)

        # TIME:
        elif RDF.rgxTime.match(value):
            return u"\"{0}\"^^xsd:time".format(value)

        # DATE - TIME:
        elif RDF.rgxDateTime.match(value):
            return u"\"{0}\"^^xsd:dateTime".format(value)

        # TEXT \u005c
        # ^^xsd:string
        elif re.search("[“”’`\r\n'\"]+", value, re.IGNORECASE):
            return to_unicode(u"\"\"\"{0}\"\"\"".format(value).replace(
                u"\\", u"\\\\"))

        else:
            # ^^xsd:string
            return to_unicode(u"\"{0}\"".format(value)).replace(u"\\", u"\\\\")
def do_validation(doc, errors_array, seen_identifiers):
    """Validate a Project Open Data (data.json) catalog document.

    Walks every dataset entry in ``doc`` (expected to be a JSON array of
    dataset dicts), checks required, required-if-applicable, and optional
    fields, and appends ``(heading, [messages])`` tuples to
    ``errors_array``.

    Args:
        doc: Parsed JSON catalog; must be a list of dataset dicts.
        errors_array: Output list, mutated in place with error summaries.
        seen_identifiers: Set of dataset identifiers seen so far; mutated
            in place and used to flag duplicates (may span multiple calls).

    NOTE(review): depends on module-level helpers (add_error,
    check_required_field, check_required_string_field, check_url_field,
    is_redacted, email_validator) and regex/constant globals defined
    elsewhere in the file. The references to ``unicode`` imply this was
    written for Python 2.
    """
    # errs maps (severity, heading) tuples to {message: locations};
    # it is populated exclusively through add_error() and flattened into
    # errors_array at the end of this function.
    errs = {}

    if type(doc) != list:
        add_error(
            errs, 0, "Bad JSON Structure",
            "The file must be an array at its top level. "
            "That means the file starts with an open bracket [ and ends with a close bracket ]."
        )
    elif len(doc) == 0:
        add_error(errs, 0, "Catalog Is Empty",
                  "There are no entries in your file.")
    else:
        for i, item in enumerate(doc):
            # Required

            dataset_name = "dataset %d" % (i + 1)

            # title
            # Once a title exists, use it (quoted) as the dataset's display
            # name in all subsequent error messages.
            if check_required_string_field(item, "title", 1, dataset_name,
                                           errs):
                dataset_name = '"%s"' % item.get("title", "").strip()

            # accessLevel # required
            if check_required_string_field(item, "accessLevel", 3,
                                           dataset_name, errs):
                if item["accessLevel"] not in ("public", "restricted public",
                                               "non-public"):
                    add_error(
                        errs, 5, "Invalid Required Field Value",
                        "The field 'accessLevel' had an invalid value: \"%s\""
                        % item["accessLevel"], dataset_name)

            # bureauCode # required
            # A redacted bureauCode is accepted without further checks.
            if not is_redacted(item.get('bureauCode')):
                if check_required_field(item, "bureauCode", list, dataset_name,
                                        errs):
                    for bc in item["bureauCode"]:
                        if not isinstance(bc, (str, unicode)):
                            add_error(errs, 5, "Invalid Required Field Value",
                                      "Each bureauCode must be a string",
                                      dataset_name)
                        elif ":" not in bc:
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                "The bureau code \"%s\" is invalid. "
                                "Start with the agency code, then a colon, then the bureau code."
                                % bc, dataset_name)
                        elif bc not in omb_burueau_codes:
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                "The bureau code \"%s\" was not found in our list "
                                "(https://project-open-data.cio.gov/data/omb_bureau_codes.csv)."
                                % bc, dataset_name)

            # contactPoint # required
            if check_required_field(item, "contactPoint", dict, dataset_name,
                                    errs):
                cp = item["contactPoint"]
                # contactPoint - fn # required
                check_required_string_field(cp, "fn", 1, dataset_name, errs)

                # contactPoint - hasEmail # required
                # Min length 9 covers the "mailto:" prefix plus a short address.
                if check_required_string_field(cp, "hasEmail", 9, dataset_name,
                                               errs):
                    if not is_redacted(cp.get('hasEmail')):
                        email = cp["hasEmail"].replace('mailto:', '')
                        if not email_validator(email):
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                "The email address \"%s\" is not a valid email address."
                                % email, dataset_name)

            # description # required
            check_required_string_field(item, "description", 1, dataset_name,
                                        errs)

            # identifier #required
            if check_required_string_field(item, "identifier", 1, dataset_name,
                                           errs):
                if item["identifier"] in seen_identifiers:
                    add_error(
                        errs, 5, "Invalid Required Field Value",
                        "The dataset identifier \"%s\" is used more than once."
                        % item["identifier"], dataset_name)
                # Remember this identifier so later repeats are flagged too.
                seen_identifiers.add(item["identifier"])

            # keyword # required
            # A plain string is a legacy (pre-v1.1 schema) form; it must now
            # be an array of strings.
            if isinstance(item.get("keyword"), (str, unicode)):
                if not is_redacted(item.get("keyword")):
                    add_error(
                        errs, 5, "Update Your File!",
                        "The keyword field used to be a string but now it must be an array.",
                        dataset_name)
            elif check_required_field(item, "keyword", list, dataset_name,
                                      errs):
                for kw in item["keyword"]:
                    if not isinstance(kw, (str, unicode)):
                        add_error(
                            errs, 5, "Invalid Required Field Value",
                            "Each keyword in the keyword array must be a string",
                            dataset_name)
                    elif len(kw.strip()) == 0:
                        add_error(
                            errs, 5, "Invalid Required Field Value",
                            "A keyword in the keyword array was an empty string.",
                            dataset_name)

            # modified # required
            if check_required_string_field(item, "modified", 1, dataset_name,
                                           errs):
                # Accept any of three date/time formats (or a redacted value).
                if not is_redacted(item['modified']) \
                        and not MODIFIED_REGEX_1.match(item['modified']) \
                        and not MODIFIED_REGEX_2.match(item['modified']) \
                        and not MODIFIED_REGEX_3.match(item['modified']):
                    add_error(
                        errs, 5, "Invalid Required Field Value",
                        "The field \"modified\" is not in valid format: \"%s\""
                        % item['modified'], dataset_name)

            # programCode # required
            if not is_redacted(item.get('programCode')):
                if check_required_field(item, "programCode", list,
                                        dataset_name, errs):
                    for pc in item["programCode"]:
                        if not isinstance(pc, (str, unicode)):
                            add_error(
                                errs, 5, "Invalid Required Field Value",
                                "Each programCode in the programCode array must be a string",
                                dataset_name)
                        elif not PROGRAM_CODE_REGEX.match(pc):
                            add_error(
                                errs, 50,
                                "Invalid Field Value (Optional Fields)",
                                "One of programCodes is not in valid format (ex. 018:001): \"%s\""
                                % pc, dataset_name)

            # publisher # required
            if check_required_field(item, "publisher", dict, dataset_name,
                                    errs):
                # publisher - name # required
                check_required_string_field(item["publisher"], "name", 1,
                                            dataset_name, errs)

            # Required-If-Applicable

            # dataQuality # Required-If-Applicable
            if item.get("dataQuality") is None or is_redacted(
                    item.get("dataQuality")):
                pass  # not required or REDACTED
            elif not isinstance(item["dataQuality"], bool):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'dataQuality' must be true or false, "
                    "as a JSON boolean literal (not the string \"true\" or \"false\").",
                    dataset_name)

            # distribution # Required-If-Applicable
            if item.get("distribution") is None:
                pass  # not required
            elif not isinstance(item["distribution"], list):
                # A redacted string is tolerated in place of the array.
                if isinstance(item["distribution"],
                              (str, unicode)) and is_redacted(
                                  item.get("distribution")):
                    pass
                else:
                    add_error(
                        errs, 50, "Invalid Field Value (Optional Fields)",
                        "The field 'distribution' must be an array, if present.",
                        dataset_name)
            else:
                for j, dt in enumerate(item["distribution"]):
                    # Skip entries that are entirely redacted strings.
                    if isinstance(dt, (str, unicode)):
                        if is_redacted(dt):
                            continue
                    distribution_name = dataset_name + (" distribution %d" %
                                                        (j + 1))
                    # distribution - downloadURL # Required-If-Applicable
                    check_url_field(False,
                                    dt,
                                    "downloadURL",
                                    distribution_name,
                                    errs,
                                    allow_redacted=True)

                    # distribution - mediaType # Required-If-Applicable
                    # mediaType becomes required whenever downloadURL exists.
                    if 'downloadURL' in dt:
                        if check_required_string_field(dt, "mediaType", 1,
                                                       distribution_name,
                                                       errs):
                            if not IANA_MIME_REGEX.match(dt["mediaType"]) \
                                    and not is_redacted(dt["mediaType"]):
                                add_error(
                                    errs, 5, "Invalid Field Value",
                                    "The distribution mediaType \"%s\" is invalid. "
                                    "It must be in IANA MIME format." %
                                    dt["mediaType"], distribution_name)

                    # distribution - accessURL # optional
                    check_url_field(False,
                                    dt,
                                    "accessURL",
                                    distribution_name,
                                    errs,
                                    allow_redacted=True)

                    # distribution - conformsTo # optional
                    check_url_field(False,
                                    dt,
                                    "conformsTo",
                                    distribution_name,
                                    errs,
                                    allow_redacted=True)

                    # distribution - describedBy # optional
                    check_url_field(False,
                                    dt,
                                    "describedBy",
                                    distribution_name,
                                    errs,
                                    allow_redacted=True)

                    # distribution - describedByType # optional
                    if dt.get("describedByType") is None or is_redacted(
                            dt.get("describedByType")):
                        pass  # not required or REDACTED
                    elif not IANA_MIME_REGEX.match(dt["describedByType"]):
                        add_error(
                            errs, 5, "Invalid Field Value",
                            "The describedByType \"%s\" is invalid. "
                            "It must be in IANA MIME format." %
                            dt["describedByType"], distribution_name)

                    # distribution - description # optional
                    # Optional fields, when present, must be non-empty strings.
                    if dt.get("description") is not None:
                        check_required_string_field(dt, "description", 1,
                                                    distribution_name, errs)

                    # distribution - format # optional
                    if dt.get("format") is not None:
                        check_required_string_field(dt, "format", 1,
                                                    distribution_name, errs)

                    # distribution - title # optional
                    if dt.get("title") is not None:
                        check_required_string_field(dt, "title", 1,
                                                    distribution_name, errs)

            # license # Required-If-Applicable
            check_url_field(False,
                            item,
                            "license",
                            dataset_name,
                            errs,
                            allow_redacted=True)

            # rights # Required-If-Applicable
            # TODO move to warnings
            # if item.get("accessLevel") != "public":
            # check_string_field(item, "rights", 1, dataset_name, errs)

            # spatial # Required-If-Applicable
            # TODO: There are more requirements than it be a string.
            if item.get("spatial") is not None and not isinstance(
                    item.get("spatial"), (str, unicode)):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'spatial' must be a string value if specified.",
                    dataset_name)

            # temporal # Required-If-Applicable
            if item.get("temporal") is None or is_redacted(
                    item.get("temporal")):
                pass  # not required or REDACTED
            elif not isinstance(item["temporal"], (str, unicode)):
                add_error(
                    errs, 10, "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' must be a string value if specified.",
                    dataset_name)
            elif "/" not in item["temporal"]:
                add_error(
                    errs, 10, "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' must be two dates separated by a forward slash.",
                    dataset_name)
            elif not TEMPORAL_REGEX_1.match(item['temporal']) \
                    and not TEMPORAL_REGEX_2.match(item['temporal']) \
                    and not TEMPORAL_REGEX_3.match(item['temporal']):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' has an invalid start or end date.",
                    dataset_name)

            # Expanded Fields

            # accrualPeriodicity # optional
            if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES \
                    and not is_redacted(item.get("accrualPeriodicity")):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'accrualPeriodicity' had an invalid value.",
                    dataset_name)

            # conformsTo # optional
            check_url_field(False,
                            item,
                            "conformsTo",
                            dataset_name,
                            errs,
                            allow_redacted=True)

            # describedBy # optional
            check_url_field(False,
                            item,
                            "describedBy",
                            dataset_name,
                            errs,
                            allow_redacted=True)

            # describedByType # optional
            if item.get("describedByType") is None or is_redacted(
                    item.get("describedByType")):
                pass  # not required or REDACTED
            elif not IANA_MIME_REGEX.match(item["describedByType"]):
                add_error(
                    errs, 5, "Invalid Field Value",
                    "The describedByType \"%s\" is invalid. "
                    "It must be in IANA MIME format." %
                    item["describedByType"], dataset_name)

            # isPartOf # optional
            if item.get("isPartOf"):
                check_required_string_field(item, "isPartOf", 1, dataset_name,
                                            errs)

            # issued # optional
            if item.get("issued") is not None and not is_redacted(
                    item.get("issued")):
                if not ISSUED_REGEX.match(item['issued']):
                    add_error(errs, 50,
                              "Invalid Field Value (Optional Fields)",
                              "The field 'issued' is not in a valid format.",
                              dataset_name)

            # landingPage # optional
            check_url_field(False,
                            item,
                            "landingPage",
                            dataset_name,
                            errs,
                            allow_redacted=True)

            # language # optional
            if item.get("language") is None or is_redacted(
                    item.get("language")):
                pass  # not required or REDACTED
            elif not isinstance(item["language"], list):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'language' must be an array, if present.",
                    dataset_name)
            else:
                for s in item["language"]:
                    if not LANGUAGE_REGEX.match(s) and not is_redacted(s):
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "The field 'language' had an invalid language: \"%s\""
                            % s, dataset_name)

            # PrimaryITInvestmentUII # optional
            if item.get("PrimaryITInvestmentUII") is None or is_redacted(
                    item.get("PrimaryITInvestmentUII")):
                pass  # not required or REDACTED
            elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(
                    item["PrimaryITInvestmentUII"]):
                add_error(
                    errs, 50, "Invalid Field Value (Optional Fields)",
                    "The field 'PrimaryITInvestmentUII' must be a string "
                    "in 023-000000001 format, if present.", dataset_name)

            # references # optional
            if item.get("references") is None:
                pass  # not required or REDACTED
            elif not isinstance(item["references"], list):
                if isinstance(item["references"],
                              (str, unicode)) and is_redacted(
                                  item.get("references")):
                    pass
                else:
                    add_error(
                        errs, 50, "Invalid Field Value (Optional Fields)",
                        "The field 'references' must be an array, if present.",
                        dataset_name)
            else:
                for s in item["references"]:
                    if not rfc3987_url.match(s) and not is_redacted(s):
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "The field 'references' had an invalid rfc3987 URL: \"%s\""
                            % s, dataset_name)

            # systemOfRecords # optional
            check_url_field(False,
                            item,
                            "systemOfRecords",
                            dataset_name,
                            errs,
                            allow_redacted=True)

            # theme #optional
            if item.get("theme") is None or is_redacted(item.get("theme")):
                pass  # not required or REDACTED
            elif not isinstance(item["theme"], list):
                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                          "The field 'theme' must be an array.", dataset_name)
            else:
                for s in item["theme"]:
                    if not isinstance(s, (str, unicode)):
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "Each value in the theme array must be a string",
                            dataset_name)
                    elif len(s.strip()) == 0:
                        add_error(
                            errs, 50, "Invalid Field Value (Optional Fields)",
                            "A value in the theme array was an empty string.",
                            dataset_name)

    # Form the output data.
    # Flatten errs into (heading, [message (+ location count)]) tuples,
    # listing the most frequently occurring messages first.
    for err_type in sorted(errs):
        errors_array.append((
            err_type[1],  # heading
            [
                err_item + (" (%d locations)" % len(errs[err_type][err_item])
                            if len(errs[err_type][err_item]) else "")
                for err_item in sorted(
                    errs[err_type], key=lambda x: (-len(errs[err_type][x]), x))
            ]))
Exemplo n.º 38
0
 def _is_valid(self, instance) -> bool:
     """Return True if *instance* is non-None and its string form is a
     valid IRI per RFC 3987 (rfc3987's 'IRI' rule)."""
     # Bug fix: the declared return type is bool, but `x and rfc3987.match(...)`
     # leaks the match object (or None/False) to the caller. Compare against
     # None so the function actually returns a bool.
     return instance is not None and rfc3987.match(str(instance), 'IRI') is not None
Exemplo n.º 39
0
def valid_url(url):
    """Return True when *url* parses as an absolute IRI (RFC 3987)."""
    # rfc3987 returns a match object on success and None otherwise.
    return iri.match(url, rule='absolute_IRI') is not None
Exemplo n.º 40
0
 def __init__(self, is_optional=False, *args, **kwargs):
     """URL-typed Property.

     Registers a validator that accepts only non-empty str/unicode values
     matching rfc3987's 'URI' rule.
     """
     Property.__init__(self, 'url', "", is_optional, *args, **kwargs)

     def _is_valid_url(val):
         # Must be text, non-empty, and a well-formed URI per RFC 3987.
         is_text = isinstance(val, str) or isinstance(val, unicode)
         return is_text and len(val) > 0 and rfc3987.match(val, 'URI') is not None

     self._set_validator(_is_valid_url)
Exemplo n.º 41
0
 def url_valid(url):
     """Return True when *url* satisfies rfc3987's 'URI' rule, else False."""
     # Match-object truthiness: a successful match is truthy, None is not.
     return rfc3987.match(url, rule='URI') is not None
Exemplo n.º 42
0
def do_validation(doc, errors_array, seen_identifiers):
    errs = {}

    if type(doc) != list:
        add_error(
            errs,
            0,
            "Bad JSON Structure",
            "The file must be an array at its top level. "
            "That means the file starts with an open bracket [ and ends with a close bracket ].",
        )
    elif len(doc) == 0:
        add_error(errs, 0, "Catalog Is Empty", "There are no entries in your file.")
    else:
        for i, item in enumerate(doc):
            # Required

            dataset_name = "dataset %d" % (i + 1)

            # title
            if check_required_string_field(item, "title", 1, dataset_name, errs):
                dataset_name = '"%s"' % item.get("title", "").strip()

            # accessLevel # required
            if check_required_string_field(item, "accessLevel", 3, dataset_name, errs):
                if item["accessLevel"] not in ("public", "restricted public", "non-public"):
                    add_error(
                        errs,
                        5,
                        "Invalid Required Field Value",
                        "The field 'accessLevel' had an invalid value: \"%s\"" % item["accessLevel"],
                        dataset_name,
                    )

            # bureauCode # required
            if not is_redacted(item.get("bureauCode")):
                if check_required_field(item, "bureauCode", list, dataset_name, errs):
                    for bc in item["bureauCode"]:
                        if not isinstance(bc, (str, unicode)):
                            add_error(
                                errs,
                                5,
                                "Invalid Required Field Value",
                                "Each bureauCode must be a string",
                                dataset_name,
                            )
                        elif ":" not in bc:
                            add_error(
                                errs,
                                5,
                                "Invalid Required Field Value",
                                'The bureau code "%s" is invalid. '
                                "Start with the agency code, then a colon, then the bureau code." % bc,
                                dataset_name,
                            )
                        elif bc not in omb_burueau_codes:
                            add_error(
                                errs,
                                5,
                                "Invalid Required Field Value",
                                'The bureau code "%s" was not found in our list '
                                "(https://project-open-data.cio.gov/data/omb_bureau_codes.csv)." % bc,
                                dataset_name,
                            )

            # contactPoint # required
            if check_required_field(item, "contactPoint", dict, dataset_name, errs):
                cp = item["contactPoint"]
                # contactPoint - fn # required
                check_required_string_field(cp, "fn", 1, dataset_name, errs)

                # contactPoint - hasEmail # required
                if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
                    if not is_redacted(cp.get("hasEmail")):
                        email = cp["hasEmail"].replace("mailto:", "")
                        if not email_validator(email):
                            add_error(
                                errs,
                                5,
                                "Invalid Required Field Value",
                                'The email address "%s" is not a valid email address.' % email,
                                dataset_name,
                            )

            # description # required
            check_required_string_field(item, "description", 1, dataset_name, errs)

            # identifier #required
            if check_required_string_field(item, "identifier", 1, dataset_name, errs):
                if item["identifier"] in seen_identifiers:
                    add_error(
                        errs,
                        5,
                        "Invalid Required Field Value",
                        'The dataset identifier "%s" is used more than once.' % item["identifier"],
                        dataset_name,
                    )
                seen_identifiers.add(item["identifier"])

            # keyword # required
            if isinstance(item.get("keyword"), (str, unicode)):
                if not is_redacted(item.get("keyword")):
                    add_error(
                        errs,
                        5,
                        "Update Your File!",
                        "The keyword field used to be a string but now it must be an array.",
                        dataset_name,
                    )
            elif check_required_field(item, "keyword", list, dataset_name, errs):
                for kw in item["keyword"]:
                    if not isinstance(kw, (str, unicode)):
                        add_error(
                            errs,
                            5,
                            "Invalid Required Field Value",
                            "Each keyword in the keyword array must be a string",
                            dataset_name,
                        )
                    elif len(kw.strip()) == 0:
                        add_error(
                            errs,
                            5,
                            "Invalid Required Field Value",
                            "A keyword in the keyword array was an empty string.",
                            dataset_name,
                        )

            # modified # required
            if check_required_string_field(item, "modified", 1, dataset_name, errs):
                if (
                    not is_redacted(item["modified"])
                    and not MODIFIED_REGEX_1.match(item["modified"])
                    and not MODIFIED_REGEX_2.match(item["modified"])
                    and not MODIFIED_REGEX_3.match(item["modified"])
                ):
                    add_error(
                        errs,
                        5,
                        "Invalid Required Field Value",
                        'The field "modified" is not in valid format: "%s"' % item["modified"],
                        dataset_name,
                    )

            # programCode # required
            if not is_redacted(item.get("programCode")):
                if check_required_field(item, "programCode", list, dataset_name, errs):
                    for pc in item["programCode"]:
                        if not isinstance(pc, (str, unicode)):
                            add_error(
                                errs,
                                5,
                                "Invalid Required Field Value",
                                "Each programCode in the programCode array must be a string",
                                dataset_name,
                            )
                        elif not PROGRAM_CODE_REGEX.match(pc):
                            add_error(
                                errs,
                                50,
                                "Invalid Field Value (Optional Fields)",
                                'One of programCodes is not in valid format (ex. 018:001): "%s"' % pc,
                                dataset_name,
                            )

            # publisher # required
            if check_required_field(item, "publisher", dict, dataset_name, errs):
                # publisher - name # required
                check_required_string_field(item["publisher"], "name", 1, dataset_name, errs)

            # Required-If-Applicable

            # dataQuality # Required-If-Applicable
            if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")):
                pass  # not required or REDACTED
            elif not isinstance(item["dataQuality"], bool):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'dataQuality' must be true or false, "
                    'as a JSON boolean literal (not the string "true" or "false").',
                    dataset_name,
                )

            # distribution # Required-If-Applicable
            if item.get("distribution") is None:
                pass  # not required
            elif not isinstance(item["distribution"], list):
                if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")):
                    pass
                else:
                    add_error(
                        errs,
                        50,
                        "Invalid Field Value (Optional Fields)",
                        "The field 'distribution' must be an array, if present.",
                        dataset_name,
                    )
            else:
                for j, dt in enumerate(item["distribution"]):
                    if isinstance(dt, (str, unicode)):
                        if is_redacted(dt):
                            continue
                    distribution_name = dataset_name + (" distribution %d" % (j + 1))
                    # distribution - downloadURL # Required-If-Applicable
                    check_url_field(False, dt, "downloadURL", distribution_name, errs, allow_redacted=True)

                    # distribution - mediaType # Required-If-Applicable
                    if "downloadURL" in dt:
                        if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
                            if not IANA_MIME_REGEX.match(dt["mediaType"]) and not is_redacted(dt["mediaType"]):
                                add_error(
                                    errs,
                                    5,
                                    "Invalid Field Value",
                                    'The distribution mediaType "%s" is invalid. '
                                    "It must be in IANA MIME format." % dt["mediaType"],
                                    distribution_name,
                                )

                    # distribution - accessURL # optional
                    check_url_field(False, dt, "accessURL", distribution_name, errs, allow_redacted=True)

                    # distribution - conformsTo # optional
                    check_url_field(False, dt, "conformsTo", distribution_name, errs, allow_redacted=True)

                    # distribution - describedBy # optional
                    check_url_field(False, dt, "describedBy", distribution_name, errs, allow_redacted=True)

                    # distribution - describedByType # optional
                    if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")):
                        pass  # not required or REDACTED
                    elif not IANA_MIME_REGEX.match(dt["describedByType"]):
                        add_error(
                            errs,
                            5,
                            "Invalid Field Value",
                            'The describedByType "%s" is invalid. '
                            "It must be in IANA MIME format." % dt["describedByType"],
                            distribution_name,
                        )

                    # distribution - description # optional
                    if dt.get("description") is not None:
                        check_required_string_field(dt, "description", 1, distribution_name, errs)

                    # distribution - format # optional
                    if dt.get("format") is not None:
                        check_required_string_field(dt, "format", 1, distribution_name, errs)

                    # distribution - title # optional
                    if dt.get("title") is not None:
                        check_required_string_field(dt, "title", 1, distribution_name, errs)

            # license # Required-If-Applicable
            check_url_field(False, item, "license", dataset_name, errs, allow_redacted=True)

            # rights # Required-If-Applicable
            # TODO move to warnings
            # if item.get("accessLevel") != "public":
            # check_string_field(item, "rights", 1, dataset_name, errs)

            # spatial # Required-If-Applicable
            # TODO: There are more requirements than it be a string.
            if item.get("spatial") is not None and not isinstance(item.get("spatial"), (str, unicode)):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'spatial' must be a string value if specified.",
                    dataset_name,
                )

            # temporal # Required-If-Applicable
            if item.get("temporal") is None or is_redacted(item.get("temporal")):
                pass  # not required or REDACTED
            elif not isinstance(item["temporal"], (str, unicode)):
                add_error(
                    errs,
                    10,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' must be a string value if specified.",
                    dataset_name,
                )
            elif "/" not in item["temporal"]:
                add_error(
                    errs,
                    10,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' must be two dates separated by a forward slash.",
                    dataset_name,
                )
            elif (
                not TEMPORAL_REGEX_1.match(item["temporal"])
                and not TEMPORAL_REGEX_2.match(item["temporal"])
                and not TEMPORAL_REGEX_3.match(item["temporal"])
            ):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'temporal' has an invalid start or end date.",
                    dataset_name,
                )

            # Expanded Fields

            # accrualPeriodicity # optional
            if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES and not is_redacted(
                item.get("accrualPeriodicity")
            ):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'accrualPeriodicity' had an invalid value.",
                    dataset_name,
                )

            # conformsTo # optional
            check_url_field(False, item, "conformsTo", dataset_name, errs, allow_redacted=True)

            # describedBy # optional
            check_url_field(False, item, "describedBy", dataset_name, errs, allow_redacted=True)

            # describedByType # optional
            if item.get("describedByType") is None or is_redacted(item.get("describedByType")):
                pass  # not required or REDACTED
            elif not IANA_MIME_REGEX.match(item["describedByType"]):
                add_error(
                    errs,
                    5,
                    "Invalid Field Value",
                    'The describedByType "%s" is invalid. ' "It must be in IANA MIME format." % item["describedByType"],
                    dataset_name,
                )

            # isPartOf # optional
            if item.get("isPartOf"):
                check_required_string_field(item, "isPartOf", 1, dataset_name, errs)

            # issued # optional
            if item.get("issued") is not None and not is_redacted(item.get("issued")):
                if not ISSUED_REGEX.match(item["issued"]):
                    add_error(
                        errs,
                        50,
                        "Invalid Field Value (Optional Fields)",
                        "The field 'issued' is not in a valid format.",
                        dataset_name,
                    )

            # landingPage # optional
            check_url_field(False, item, "landingPage", dataset_name, errs, allow_redacted=True)

            # language # optional
            if item.get("language") is None or is_redacted(item.get("language")):
                pass  # not required or REDACTED
            elif not isinstance(item["language"], list):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'language' must be an array, if present.",
                    dataset_name,
                )
            else:
                for s in item["language"]:
                    if not LANGUAGE_REGEX.match(s) and not is_redacted(s):
                        add_error(
                            errs,
                            50,
                            "Invalid Field Value (Optional Fields)",
                            "The field 'language' had an invalid language: \"%s\"" % s,
                            dataset_name,
                        )

            # PrimaryITInvestmentUII # optional
            if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")):
                pass  # not required or REDACTED
            elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'PrimaryITInvestmentUII' must be a string " "in 023-000000001 format, if present.",
                    dataset_name,
                )

            # references # optional
            if item.get("references") is None:
                pass  # not required or REDACTED
            elif not isinstance(item["references"], list):
                if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")):
                    pass
                else:
                    add_error(
                        errs,
                        50,
                        "Invalid Field Value (Optional Fields)",
                        "The field 'references' must be an array, if present.",
                        dataset_name,
                    )
            else:
                for s in item["references"]:
                    if not rfc3987_url.match(s) and not is_redacted(s):
                        add_error(
                            errs,
                            50,
                            "Invalid Field Value (Optional Fields)",
                            "The field 'references' had an invalid rfc3987 URL: \"%s\"" % s,
                            dataset_name,
                        )

                if len(item["references"]) != len(set(item["references"])):
                    add_error(
                        errs,
                        50,
                        "Invalid Field Value (Optional Fields)",
                        "The field 'references' has duplicates",
                        dataset_name,
                    )

            # systemOfRecords # optional
            check_url_field(False, item, "systemOfRecords", dataset_name, errs, allow_redacted=True)

            # theme #optional
            if item.get("theme") is None or is_redacted(item.get("theme")):
                pass  # not required or REDACTED
            elif not isinstance(item["theme"], list):
                add_error(
                    errs,
                    50,
                    "Invalid Field Value (Optional Fields)",
                    "The field 'theme' must be an array.",
                    dataset_name,
                )
            else:
                for s in item["theme"]:
                    if not isinstance(s, (str, unicode)):
                        add_error(
                            errs,
                            50,
                            "Invalid Field Value (Optional Fields)",
                            "Each value in the theme array must be a string",
                            dataset_name,
                        )
                    elif len(s.strip()) == 0:
                        add_error(
                            errs,
                            50,
                            "Invalid Field Value (Optional Fields)",
                            "A value in the theme array was an empty string.",
                            dataset_name,
                        )

    # Form the output data.
    for err_type in sorted(errs):
        errors_array.append(
            (
                err_type[1],  # heading
                [
                    err_item
                    + (" (%d locations)" % len(errs[err_type][err_item]) if len(errs[err_type][err_item]) else "")
                    for err_item in sorted(errs[err_type], key=lambda x: (-len(errs[err_type][x]), x))
                ],
            )
        )
Exemplo n.º 43
0
def is_valid_url(a_string):
    """Return True when *a_string* parses as an RFC 3987 URI, else False."""
    # rfc3987.match returns a match object or None; bool() normalizes it.
    return bool(rfc3987.match(a_string, 'URI'))
Exemplo n.º 44
0
def run(repo):
    """Serve a git-backed RDF graph store for repository *repo*.

    Dispatches on the HTTP method:
      GET    - return the graph file as text/turtle
      POST   - merge the posted triples into the existing graph, commit
      PUT    - replace the graph with the posted triples, commit
      DELETE - remove the graph file, commit

    Aborts with 404 (repo or graph missing), 406 (invalid graph IRI or
    unsupported method), 412 (unsupported content type).
    """

    log.info("Request to: " + request.base_url)
    log.info("workspace:" + workspace)
    # Resolve the repository directory inside the configured workspace.
    repopath = posixpath.join(workspace, repo)

    log.info("repopath:" + repopath)

    log.info("Running " + repo + " repository in " + workspace)
    log.info(repopath)

    # status() presumably checks the repo exists/is a git repo — 404 otherwise.
    if not status(repopath):
        log.debug("Repository " + repo + " doesnt exist in " + repopath)
        abort(404)
    else:
        log.info("Repository size: " + size(repopath))

    # Query parameters: target graph IRI and git branch to operate on.
    graph = request.args.get('graph', '')
    branch = request.args.get('branch', '')
    checkoutBranch(branch, repopath)

    # No graph given: render an index of all graphs in the repository.
    if graph == "":
        filelist = lsfiles(repopath)
        return render_template('list.html', data=pathsToURIs(repo, filelist))
    elif not match(graph, rule='absolute_IRI'):
        # rfc3987 validation: the graph parameter must be an absolute IRI.
        log.debug("Graph URI is not valid")
        abort(406)

    if notSupportedContentType(request):
        abort(412)

    # Maps the graph IRI to a file path inside the repository.
    fileGraph = FileGraph(graph, domain, repopath, repo)

    if request.method == 'GET':
        if fileGraph.doExists():
            return send_file(fileGraph.filepath, 'text/turtle')
        else:
            abort(404)
    elif request.method == 'POST':
        #TODO: Abort on history branches. Create list of actual branches?
        log.debug("POST to " + fileGraph.iri)
        # POST semantics: merge — parse the existing file first, then add
        # the request body, so prior triples are preserved.
        if fileGraph.doExists():
            fileGraph.parsePath()
        fileGraph.parseString(request.data.decode('utf-8'))
        fileGraph.serialize()
        autoAddAndCommit(fileGraph, "POST - Adding graph " + fileGraph.iri)
        return loglast(repopath)
    elif request.method == 'PUT':
        #TODO: Abort on history branches
        log.debug("PUT to " + fileGraph.iri)
        # PUT semantics: replace — only the request body is parsed.
        fileGraph.parseString(request.data.decode('utf-8'))
        fileGraph.serialize()
        autoAddAndCommit(fileGraph, "PUT - Writing graph " + fileGraph.iri)
        return loglast(repopath)
    elif request.method == 'DELETE':
        #TODO: Abort on history branches
        log.debug("DELETE to " + fileGraph.iri)
        if fileGraph.doExists():
            deleteFile(fileGraph.filepath)
            autoAddAndCommit(fileGraph,
                             "DELETE - Removing graph " + fileGraph.iri)
            return ('', 204)
        else:
            log.debug("GRAPH " + fileGraph.iri + " DOES NOT EXIST in " +
                      fileGraph.filepath)
            abort(404)
    else:
        abort(406)
Exemplo n.º 45
0
    def __init__(self, ref_url, stop_words=None, late_kills=None):
        """
        Arguments:
        ----------
        ref_url : str
            Web page from which search terms are to be extracted
        stop_words : sequence or set
            List or set with common words to be excluded from search string.
            `stop_list` will be applied *before* any multiple word phrases are
            constructed.
        late_kills : sequence or set
            Like `stop_words` but the words in `late_kills` will only be
            eliminated from the search string *after* multiple word phrases
            have been constructed. Thus you can have a word like 'report'
            appear in the search string as part of a multiple word phrase
            ("OECD Report on Public Health") but not as a single word (which
            would have almost zero selectivity for a news article).

        Raises InvalidUrlError, ArticleFormatError, PageRetrievalError,
        NotAnArticleError or ArticleExtractionError on failure.
        """
        if not rfc3987.match(ref_url, rule='URI_reference'):
            raise InvalidUrlError(ref_url)
        # check if url points to non-html:
        p_url = urlparse(ref_url)
        if p_url[2].endswith(tuple(WebArticle.exclude_formats)):
            raise ArticleFormatError(p_url[2])

        # BUGFIX: fall back to an empty frozenset, not a list — the later
        # self._stop_words.union(self._late_kills) call would otherwise
        # raise AttributeError whenever stop_words/late_kills is omitted.
        self._stop_words = frozenset(stop_words) if stop_words else frozenset()
        self._late_kills = frozenset(late_kills) if late_kills else frozenset()

        self.url = ref_url
        try:
            result = requests.get(ref_url, headers={'User-Agent':
                    random.choice(REF.user_agents)}, proxies=_get_proxies())
        except requests.exceptions.RequestException:
            raise PageRetrievalError(ref_url)
        if not result.status_code == requests.codes.ok:
            raise PageRetrievalError(ref_url)
        ht = result.text
        encoding = result.encoding if result.encoding else 'utf-8'
        # BUGFIX: use the computed fallback `encoding` — encoding with
        # result.encoding directly fails (TypeError) when it is None.
        ht = ht.encode(encoding) if isinstance(ht, unicode) else ht
        # need to parse only to check for excessive number of headings
        parsed = html.fromstring(ht)
        if max([len(parsed.xpath('//h{0}'.format(i+1)))
                for i in xrange(4)]) > WebArticle.max_headings:
            logging.debug("too many headings in %s, raising exception",
                    ref_url)
            raise NotAnArticleError(ref_url)
        # now get the article content
        g = Goose()
        try:
            article = g.extract(raw_html=ht)
        except ValueError:
            logging.debug("could not extract article from %s" % ref_url)
            raise ArticleExtractionError(ref_url)
        self.title = article.title
        self.text = article.cleaned_text
        if not self.text:
            logging.debug("could not extract article from %s" % ref_url)
            raise ArticleExtractionError(ref_url)

        self.wlist = build_wlist(self.text)
        self.wcount = len(self.wlist)
        logging.debug("built %d word list for article \"%s\"" %
                (self.wcount, self.title))

        # Stem everything longer than 2 chars that is not a stop word,
        # then rank stems by frequency.
        sl = [WebArticle.stemmer.stem(w) for w in self.wlist if w not in
              self._stop_words.union(self._late_kills) and len(w) > 2]
        self.stem_tops = Counter(sl).most_common()
Exemplo n.º 46
0
    def analyze(self, graph):
        """Survey SKOS usage in *graph*.

        Returns a dict with per-concept usage counts, per-scheme usage
        counts, top concepts of each scheme, and all (ordered) collections.
        IRIs that fail RFC 3987 validation are skipped.
        """
        log = logging.getLogger(__name__)
        concept_counts = {}
        scheme_counts = {}
        top_concepts = {}

        concepts = [
            row['concept'] for row in graph.query("""
        SELECT DISTINCT ?concept WHERE {
            ?concept a <http://www.w3.org/2004/02/skos/core#Concept>.
        }
        """)
        ]

        for concept in concepts:
            if not rfc3987.match(concept):
                log.debug(f'{concept} is a not valid IRI')
                continue
            # Last row wins if the query yields several bindings.
            for row in graph.query(SkosAnalyzer._count_query(concept)):
                concept_counts[concept] = row['count']

        schemes = [
            row['scheme'] for row in graph.query("""
        SELECT DISTINCT ?scheme WHERE {
            OPTIONAL {?scheme a <http://www.w3.org/2004/02/skos/core#ConceptScheme>.}
            OPTIONAL {?_ <http://www.w3.org/2004/02/skos/core#inScheme> ?scheme.}
        }
        """)
        ]

        for scheme in schemes:
            if not rfc3987.match(scheme):
                log.debug(f'{scheme} is a not valid IRI')
                continue
            for row in graph.query(
                    SkosAnalyzer._scheme_count_query(str(scheme))):
                scheme_counts[scheme] = row['count']

        for scheme in schemes:
            if not rfc3987.match(scheme):
                continue
            top_concepts[scheme] = [
                row['concept'] for row in graph.query(
                    SkosAnalyzer._scheme_top_concept(str(scheme)))
            ]

        skos_collections = [
            row['coll'] for row in graph.query("""
        SELECT DISTINCT ?coll WHERE {
            OPTIONAL { ?coll a <http://www.w3.org/2004/02/skos/core#Collection>. }
            OPTIONAL { ?coll a <http://www.w3.org/2004/02/skos/core#OrderedCollection>. }
            OPTIONAL { ?a <http://www.w3.org/2004/02/skos/core#member> ?coll. }
            OPTIONAL { ?coll <http://www.w3.org/2004/02/skos/core#memberList> ?b. }
        }
        """)
        ]

        ordered_collections = [
            row['coll'] for row in graph.query("""
        SELECT DISTINCT ?coll WHERE {
            ?coll a <http://www.w3.org/2004/02/skos/core#OrderedCollection>.
        }
        """)
        ]

        return {
            'concept': concept_counts,
            'schema': scheme_counts,
            'topConcepts': top_concepts,
            'collection': skos_collections,
            'orderedCollection': ordered_collections
        }
Exemplo n.º 47
0
# -*- coding: utf-8 -*-
"""Download image URLs given on the command line into a temp cache dir."""
import sys
import os
import time
import re
import tempfile
import urllib
# import urllib.request;
import rfc3987

# Extensions treated as downloadable images (compared lower-case).
IMAGE_EXTS = (".bmp", ".jpg", ".jpeg", ".png", ".gif")

# Renamed from `dir` to avoid shadowing the builtin.
cache_dir = os.path.join(tempfile.gettempdir(), "lview")
# os.makedirs(cache_dir, exist_ok=True);
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

for arg in sys.argv[1:]:
    # Skip anything that is not a valid RFC 3987 URI.
    if not rfc3987.match(arg, "URI"):
        continue
    parts = rfc3987.parse(arg, "URI")
    basename = os.path.basename(parts["path"])
    ext = os.path.splitext(basename)[1]

    if ext.lower() in IMAGE_EXTS:
        response = urllib.urlopen(arg)
        # response = urllib.request.urlopen(arg);
        try:
            # Prefix with a timestamp so repeated downloads don't collide.
            filename = str(int(time.time())) + "_" + basename
            # BUGFIX: close both the HTTP response and the output file
            # (the original leaked the response handle).
            with open(os.path.join(cache_dir, filename), "wb") as fpw:
                fpw.write(response.read())
        finally:
            response.close()
Exemplo n.º 48
0
def _dcat_extractor(g, red, log):
    """Extract DCAT distributions and DCAT2 endpoints from graph *g*.

    Records dataset titles and dataset<->distribution mappings in redis
    (*red*), routes RDF distributions into a priority queue, and returns
    the celery AsyncResult of the spawned processing task group.
    """
    distributions, distributions_priority = [], []
    endpoints = []
    dcat = Namespace('http://www.w3.org/ns/dcat#')
    dcterms = Namespace('http://purl.org/dc/terms/')
    nkod = Namespace('https://data.gov.cz/slovník/nkod/mediaTyp')
    # RDF media types / file types that should be processed with priority.
    media_priority = set([
        'https://www.iana.org/assignments/media-types/application/rdf+xml',
        'https://www.iana.org/assignments/media-types/application/trig',
        'https://www.iana.org/assignments/media-types/text/n3',
        'https://www.iana.org/assignments/media-types/application/ld+json',
        'https://www.iana.org/assignments/media-types/application/n-triples',
        'https://www.iana.org/assignments/media-types/application/n-quads',
        'https://www.iana.org/assignments/media-types/text/turtle'
    ])  #IANA
    format_priority = set([
        'http://publications.europa.eu/resource/authority/file-type/RDF',
        'http://publications.europa.eu/resource/authority/file-type/RDFA',
        'http://publications.europa.eu/resource/authority/file-type/RDF_N_QUADS',
        'http://publications.europa.eu/resource/authority/file-type/RDF_N_TRIPLES',
        'http://publications.europa.eu/resource/authority/file-type/RDF_TRIG',
        'http://publications.europa.eu/resource/authority/file-type/RDF_TURTLE',
        'http://publications.europa.eu/resource/authority/file-type/RDF_XML',
        'http://publications.europa.eu/resource/authority/file-type/JSON_LD',
        'http://publications.europa.eu/resource/authority/file-type/N3'
    ])  #EU

    log.info("Extracting distributions")
    #DCAT dataset
    dsdistr, distrds = ds_distr()
    with red.pipeline() as pipe:
        for ds in g.subjects(RDF.type, dcat.Dataset):
            #dataset titles (possibly multilang)
            for t in g.objects(ds, dcterms.title):
                key = ds_title(ds, t.language)
                red.set(key, t.value)

            #DCAT Distribution
            for d in g.objects(ds, dcat.distribution):
                # BUGFIX: reset the target queue for every distribution;
                # previously one priority hit made all later distributions
                # (even of other datasets) land in the priority queue.
                queue = distributions

                # put RDF distributions into a priority queue
                for media in g.objects(d, dcat.mediaType):
                    if str(media) in media_priority:
                        queue = distributions_priority

                # `fmt` instead of `format` to avoid shadowing the builtin
                for fmt in g.objects(d, dcterms.format):
                    if str(fmt) in format_priority:
                        queue = distributions_priority

                # data.gov.cz specific
                for fmt in g.objects(d, nkod.mediaType):
                    if 'rdf' in str(fmt):
                        queue = distributions_priority

                # download URL to files
                for downloadURL in g.objects(d, dcat.downloadURL):
                    if rfc3987.match(str(downloadURL)):
                        log.debug(
                            f'Distribution {downloadURL!s} from DCAT dataset {ds!s}'
                        )
                        queue.append(downloadURL)
                        # NOTE(review): hset keys on str(ds), so only the
                        # last distribution per dataset is kept — confirm
                        # whether that is intended.
                        pipe.hset(dsdistr, str(ds), str(downloadURL))
                        pipe.hset(distrds, str(downloadURL), str(ds))
                    else:
                        # BUGFIX: logged the undefined name `access` before
                        # (NameError); also warn() is a deprecated alias.
                        log.warning(f'{downloadURL!s} is not a valid download URL')

                # scan for DCAT2 data services here as well
                for access in g.objects(d, dcat.accessURL):
                    for endpoint in g.objects(access, dcat.endpointURL):
                        if rfc3987.match(str(endpoint)):
                            log.debug(
                                f'Endpoint {endpoint!s} from DCAT dataset {ds!s}'
                            )
                            endpoints.append(endpoint)

                            pipe.hset(dsdistr, str(ds), str(endpoint))
                            pipe.hset(distrds, str(endpoint), str(ds))
                        else:
                            # BUGFIX: this was a for-else clause, firing after
                            # every loop and referencing a possibly-unbound
                            # `endpoint`; it belongs to the validity check.
                            log.warning(f'{endpoint!s} is not a valid endpoint URL')

        pipe.sadd('purgeable', dsdistr, distrds)
        # TODO: expire
        pipe.execute()
    # TODO: possibly scan for service description as well

    # Priority distributions first, then endpoints, then the rest.
    tasks = [process_priority.si(a) for a in distributions_priority]
    tasks.extend(process_endpoint.si(e) for e in endpoints)
    tasks.extend(process.si(a) for a in distributions)
    return group(tasks).apply_async()
Exemplo n.º 49
0
 def _validate_type_url(self, value):
     """Validator hook: report *value* as type `url` when it parses as an
     RFC 3987 URI; otherwise fall through (implicitly returns None).
     """
     is_uri = match(value, rule='URI')
     if is_uri:
         return True
Exemplo n.º 50
0
 def __init__(self, url):
     """Validate *url* as an http(s) RFC 3987 URI, store it, then fetch
     and parse the STA status from its RSS feed.
     """
     # Guard clause: must be a parseable URI *and* use an http(s) scheme.
     if rfc3987.match(url, 'URI') is None or not re.match("^http(s)?://.*", url):
         raise ValueError('Invalid URL')
     self.url = url
     self.__getStaStatus(self.__getMessageFromStaRss())
Exemplo n.º 51
0
 def isValidURL(untestedStr):
     """Return True when the given string parses as an RFC 3987 IRI."""
     # match() yields a match object or None; bool() normalizes either way.
     return bool(match(untestedStr, rule="IRI"))
Exemplo n.º 52
0
 def is_type(cls, other):
     """Return whether *other* is a URI string or an HTMLURI instance.

     Strings are accepted when they parse as an RFC 3987 URI.
     """
     # isinstance instead of `type(...) is str` so str subclasses pass too.
     if isinstance(other, str):
         # Normalize rfc3987's match-object-or-None to a real bool, matching
         # the bool returned by the isinstance branch below.
         return rfc3987.match(other, rule='URI') is not None
     return isinstance(other, HTMLURI)