Exemplo n.º 1
0
def normalize_url(url):
    """Return the normalized form of ``url``, or ``None`` if its scheme is refused.

    A missing scheme is filled in with ``http`` when
    ``model.host_use_http`` accepts the host (the original comments mention
    .onion and .i2p TLDs) and with ``https`` otherwise.  The host is
    IDNA-encoded and lower-cased, query and fragment are dropped, and a lone
    ``/`` path is emptied before ``rfc3986.normalize_uri`` is applied.
    """
    parsed = rfc3986.urlparse(url)

    # Without a scheme and without "//", RFC 3986 reads the input as a path.
    # We know it is meant as a host, so re-parse it as "//<input>".
    if parsed.scheme is None and parsed.host is None and parsed.path is not None:
        parsed = rfc3986.urlparse('//' + url)

    # The reference starts with "//": choose the default scheme.
    if parsed.scheme is None:
        default_scheme = 'http' if model.host_use_http(parsed.host) else 'https'
        parsed = parsed.copy_with(scheme=default_scheme)

    # First normalization pass:
    # * IDNA-encode the host to avoid misleading look-alike names
    # * drop query and fragment
    # * turn a bare "/" path into an empty path
    lowered_scheme = parsed.scheme.lower()
    ascii_host = idna.encode(parsed.host).decode('utf-8').lower()
    trimmed_path = '' if parsed.path == '/' else parsed.path
    parsed = parsed.copy_with(scheme=lowered_scheme,
                              host=ascii_host,
                              path=trimmed_path,
                              query=None,
                              fragment=None)

    # Only https is accepted, except http for hosts that must use http.
    if parsed.scheme == 'https':
        accepted = not model.host_use_http(parsed.host)
    elif parsed.scheme == 'http':
        accepted = model.host_use_http(parsed.host)
    else:
        accepted = False

    if not accepted:
        return None
    return rfc3986.normalize_uri(parsed.geturl())
Exemplo n.º 2
0
    def __init__(self, scope, receive):
        """Build the request around a ``StarletteRequest`` made from ``scope`` and ``receive``.

        Headers, mimetype, method, URL and query parameters are read from the
        wrapped request once and stored as attributes.
        """
        self._starlette = StarletteRequest(scope, receive)
        self.formats = None
        self.encoding = "utf-8"

        header_map = CaseInsensitiveDict()
        for name, content in self._starlette.headers.items():
            header_map[name] = content

        #: A case-insensitive dictionary, containing all headers sent in the Request.
        self.headers = header_map

        self.mimetype = self.headers.get("Content-Type", "")

        #: The incoming HTTP method used for the request, lower-cased.
        self.method = self._starlette.method.lower()

        #: The full URL of the Request, query parameters and all.
        self.full_url = str(self._starlette.url)

        #: The parsed URL of the Request
        self.url = rfc3986.urlparse(self.full_url)

        try:
            #: A dictionary of the parsed query parameters used for the Request.
            self.params = QueryDict(self.url.query)
        except AttributeError:
            # Fall back to an empty mapping when the query cannot be parsed.
            self.params = {}
Exemplo n.º 3
0
 def __repr__(self) -> str:
     """Return ``ClassName('<url>')`` with the userinfo rewritten as ``<username>:[secure]``.

     When the URL carries userinfo, whatever follows the username is replaced
     by the literal ``[secure]`` so credentials do not leak through debug
     output or logs.
     """
     from urllib.parse import quote

     class_name = self.__class__.__name__
     url_str = str(self)
     if self._uri_reference.userinfo:
         # Percent-encode the username before putting it back into the
         # userinfo component, matching the sibling implementation of this
         # method.  NOTE(review): assumes ``self.username`` is the decoded
         # (unquoted) username -- confirm against the property.
         username = quote(self.username)
         url_str = (rfc3986.urlparse(url_str).copy_with(
             userinfo=f"{username}:[secure]").unsplit())
     return f"{class_name}({url_str!r})"
Exemplo n.º 4
0
def str_parse(text):
    """Return ``text`` unchanged when it has a scheme, otherwise its parsed path.

    ``text`` is UTF-8 encoded before being handed to ``urlparse``.  When the
    parse result has neither a scheme nor a path, an empty string is returned.
    """
    parsed = urlparse(text.encode('utf8'))
    if parsed.scheme is not None:
        return text
    # No scheme: fall back to the path component, if there is one.
    return u'' if parsed.path is None else parsed.path
Exemplo n.º 5
0
def get_file_uri(url):
    """Parse ``url`` and return whatever ``validate_uri`` returns for it.

    Raises:
        MissingComponentError, UnpermittedComponentError,
        InvalidComponentsError: re-raised from ``validate_uri`` after the
        failure has been logged.
    """
    parsed = rfc3986.urlparse(url)
    try:
        return validate_uri(parsed)
    except (MissingComponentError, UnpermittedComponentError,
            InvalidComponentsError) as e:
        # The message shows the parsed reference, not the raw input string.
        logging.error("File uri '{0}' not valid".format(parsed))
        raise e
def save_acme_key_as_file(logger, bytes, user_provided_path):
    """Store ``bytes`` at the location named by ``user_provided_path``.

    A ``file`` URI is resolved with ``get_filepath`` and written through
    ``save_file_to_disc``; an ``s3`` URI is handed to ``save_file_to_s3``.

    Raises:
        ValueError: if the URI scheme is neither ``file`` nor ``s3``.
    """
    target = rfc3986.urlparse(user_provided_path)
    scheme = target.scheme

    if scheme == "file":
        save_file_to_disc(logger, bytes, get_filepath(target))
        return
    if scheme == "s3":
        save_file_to_s3(logger, bytes, target)
        return

    raise ValueError(
        "Invalid acme account key: {!r}".format(user_provided_path))
Exemplo n.º 7
0
 def __repr__(self) -> str:
     """Return ``ClassName('<url>')`` with the userinfo rewritten as ``<username>:[secure]``."""
     class_name = self.__class__.__name__
     rendered = str(self)
     if self._uri_reference.userinfo:
         # Hide any password in the representation to reduce the risk of it
         # leaking through debug output or logging.
         masked_userinfo = f"{quote(self.username)}:[secure]"
         parsed = rfc3986.urlparse(rendered)
         rendered = parsed.copy_with(userinfo=masked_userinfo).unsplit()
     return f"{class_name}({rendered!r})"
Exemplo n.º 8
0
    def __init__(self):
        """Create ``self.ipfs_client`` for the hard-coded IPFS RPC endpoint."""
        endpoint = urlparse("https://ipfs.singularitynet.io:80")
        # Fall back to http / port 5001 when the endpoint omits them.
        scheme = endpoint.scheme or "http"
        port = endpoint.port or 5001

        # NOTE(review): if ``urljoin`` is ``urllib.parse.urljoin``, joining a
        # bare scheme with a host name yields just the host name, so the scheme
        # would not reach ``ipfsapi.connect`` -- confirm this is intended.
        host = urljoin(scheme, endpoint.hostname)
        self.ipfs_client = ipfsapi.connect(host, port, session=True)
Exemplo n.º 9
0
def url_path(url_path):
    """Return ``True`` if ``url_path`` looks like a URL path, else raise.

    Raises:
        exceptions.InvalidURLPath: if the normalized value has a scheme,
        userinfo, host or port, has no path, or its path does not start with
        ``/`` -- or if parsing fails for any reason.
    """
    try:
        parsed = rfc3986.urlparse(rfc3986.normalize_uri(url_path))

        beyond_path = (parsed.scheme or parsed.userinfo
                       or parsed.host or parsed.port)
        if (beyond_path or parsed.path is None
                or not parsed.path.startswith('/')):
            raise exceptions.InvalidURLPath(url_path=url_path)
    except Exception:
        # Any failure, including the one raised just above, is reported as
        # an invalid URL path.
        raise exceptions.InvalidURLPath(url_path=url_path)
    return True
Exemplo n.º 10
0
def url(url):
    """Return ``True`` if ``url`` looks like an http(s) URL, else raise.

    Raises:
        exceptions.InvalidURL: if ``url`` is not a valid URI with a scheme,
        if its normalized scheme is neither ``http`` nor ``https``, or if
        parsing fails for any reason.
    """
    try:
        if not rfc3986.is_valid_uri(url, require_scheme=True):
            raise exceptions.InvalidURL(url=url)
        scheme = rfc3986.urlparse(rfc3986.normalize_uri(url)).scheme
        if scheme not in ('http', 'https'):
            raise exceptions.InvalidURL(url=url)
    except Exception:
        # Every failure mode is reported uniformly as InvalidURL.
        raise exceptions.InvalidURL(url=url)
    return True
Exemplo n.º 11
0
def url(url):
    """Validate that ``url`` is a well-formed URL using http or https.

    Returns ``True`` on success and raises ``exceptions.InvalidURL`` for
    every kind of failure, including errors raised while parsing.
    """
    try:
        if not rfc3986.is_valid_uri(url, require_scheme=True):
            raise exceptions.InvalidURL(url=url)
        normalized = rfc3986.normalize_uri(url)
        parsed = rfc3986.urlparse(normalized)
        is_web_scheme = parsed.scheme == 'http' or parsed.scheme == 'https'
        if not is_web_scheme:
            raise exceptions.InvalidURL(url=url)
    except Exception:
        raise exceptions.InvalidURL(url=url)
    return True
Exemplo n.º 12
0
def url_validation(url):
    """Check ``url`` and return ``(True, None)`` or ``(False, reason)``.

    The URL must already be in normalized form, use https (or http when
    ``host_use_http`` accepts the host), and carry neither a query nor a
    fragment.
    """
    normalized = rfc3986.normalize_uri(url)
    if normalized != url:
        return False, f'URL must be normalized to {normalized}'

    parsed = rfc3986.urlparse(normalized)
    if parsed.scheme == 'https':
        scheme_ok = not host_use_http(parsed.host)
    elif parsed.scheme == 'http':
        scheme_ok = host_use_http(parsed.host)
    else:
        scheme_ok = False
    if not scheme_ok:
        return False, 'the protocol is neither https nor http with an .onion/.i2p TLD'

    # Query is checked before fragment, so a URL with both reports the query.
    forbidden = ((parsed.query, 'no query in the URL'),
                 (parsed.fragment, 'no fragment in the URL'))
    for component, reason in forbidden:
        if component is not None:
            return False, reason
    return True, None
 def _validate_info(self, broker_info) -> bool:
     """Parse ``broker_info`` and store the broker host, port and userinfo.

     Returns:
         ``True`` when the scheme is ``mqtt`` or ``ws`` and both a host and a
         port are present, ``False`` otherwise.  The ``broker_*`` attributes
         are assigned as soon as the scheme check passes, even when the
         method then returns ``False``.
     """
     self.logger.debug("Validating " + broker_info)
     parseduri = urlparse(broker_info)
     if parseduri.scheme not in ("mqtt", "ws"):
         return False
     self.broker_url = parseduri.host
     self.broker_port = parseduri.port
     self.broker_user = parseduri.userinfo
     # NOTE(review): userinfo may be of the form "user:password"; if so this
     # debug line logs the password -- confirm that is acceptable.
     self.logger.debug("broker_user {}".format(self.broker_user))
     self.logger.debug("broker_url {}, broker_port: {}".format(self.broker_url, self.broker_port))
     return bool(self.broker_url and self.broker_port)
Exemplo n.º 14
0
def _parse_sophora_url(url: str) -> Tuple[str, str, Optional[int]]:

    # Special cases
    if url == "https://www1.wdr.de/nachrichten/nrw":
        # TODO: Investigate if there are more like this
        url = "https://www1.wdr.de/nachrichten/index.html"

    # Ensure that overview pages with missing "index.html" suffix
    # get related to the same SophoraID
    if url.endswith("/"):
        logger.debug("Adding index.html suffix")
        url = url + "index.html"

    parsed = urlparse(url)
    match = re.match(
        r"(.*)/(.*?)(?:~_page-(\d+))?\.(?:html|amp)$",
        unquote(parsed.path),
    )

    if match is None:
        # Parsing errors that are known and we want to ignore
        match_expected = re.match(
            r".*\.(?:jsp|pdf|news)$",
            unquote(parsed.path),
        ) or re.match(
            r".*/:~:text=.*$",
            unquote(parsed.path),
        )

        if match_expected is None:
            logger.error("Unexpected parsing error: {}", url)
            sentry_sdk.capture_message(
                f"Failed parsing URL with unexpected format: {url}",
                level="error",
            )
        else:
            logger.debug("Ignored parsing error: {}", url)

        raise SkipPageException(url)

    node = match.group(1)
    sophora_id = match.group(2)
    # Cut off any other weird Sophora parameters
    sophora_id = re.sub(r"~.*", "", sophora_id)
    if sophora_id == "index":
        sophora_id = f"{node}/{sophora_id}"

    sophora_page = match.group(3)
    if sophora_page is not None:
        sophora_page = int(sophora_page)
    return sophora_id, node, sophora_page
Exemplo n.º 15
0
    def _get_grpc_channel(self):
        """Open a gRPC channel to the configured service endpoint.

        The endpoint comes from ``self.options["endpoint"]`` or, when that is
        absent, from the first endpoint listed for this group in the service
        metadata.  ``http`` gives an insecure channel, ``https`` a channel
        with default SSL credentials.

        Raises:
            ValueError: if the endpoint scheme is neither ``http`` nor ``https``.
        """
        endpoint = self.options.get("endpoint", None)
        if endpoint is None:
            endpoint = self.service_metadata.get_all_endpoints_for_group(self.group["group_name"])[0]
        parsed = urlparse(endpoint)

        # The channel target is "host" or "host:port".
        if parsed.port is None:
            target = parsed.hostname
        else:
            target = parsed.hostname + ":" + str(parsed.port)

        scheme = parsed.scheme
        if scheme == "http":
            return grpc.insecure_channel(target)
        if scheme == "https":
            return grpc.secure_channel(target, grpc.ssl_channel_credentials())
        raise ValueError('Unsupported scheme in service metadata ("{}")'.format(scheme))
 def _validate_info(self, broker_info) -> bool:
     """Parse ``broker_info`` into ``self.url`` and check the configuration.

     Returns:
         ``True`` when the scheme is ``http`` or ``https``, a host is present
         and ``bucket``, ``org``, ``token`` and ``measurement`` are all set;
         ``False`` otherwise.
     """
     self.logger.debug("Validating " + broker_info)
     parsed_uri = urlparse(broker_info)
     if parsed_uri.scheme not in ("http", "https"):
         return False
     if not parsed_uri.host:
         return False

     # Rebuild "scheme://host[:port]", dropping every other URL component.
     if parsed_uri.port:
         authority = "{}:{}".format(parsed_uri.host, parsed_uri.port)
     else:
         authority = parsed_uri.host
     self.url = "{}://{}".format(parsed_uri.scheme, authority)

     if not (self.url and self.bucket and self.org and self.token and self.measurement):
         return False

     # The token is a credential, so it is not written to the log.
     self.logger.debug(
         "url {}, org: {}, bucket: {}, measurement: {}, token: {}".format(self.url, self.org, self.bucket,
                                                                          self.measurement, "[redacted]"))
     return True
Exemplo n.º 17
0
    def _get_base_grpc_channel(self, endpoint):
        """Open a gRPC channel to ``endpoint``.

        ``http`` endpoints get an insecure channel and ``https`` endpoints a
        channel with default SSL credentials.

        Raises:
            ValueError: if the endpoint scheme is neither ``http`` nor ``https``.
        """
        parsed = urlparse(endpoint)

        # The channel target is "host" or "host:port".
        if parsed.port is None:
            target = parsed.hostname
        else:
            target = parsed.hostname + ":" + str(parsed.port)

        scheme = parsed.scheme
        if scheme == "http":
            return grpc.insecure_channel(target)
        if scheme == "https":
            credentials = grpc.ssl_channel_credentials()
            return grpc.secure_channel(target, credentials)
        raise ValueError(
            'Unsupported scheme in service metadata ("{}")'.format(scheme))
Exemplo n.º 18
0
def setup_acme_client(s3_client, acme_directory_url, acme_account_key):
    """Load the ACME account key and build an ACME client from it.

    Args:
        s3_client: client used to fetch the key when it lives in S3.
        acme_directory_url: passed through to ``acme_client_for_private_key``.
        acme_account_key: ``file:`` or ``s3:`` URI locating the PEM key.

    Raises:
        ValueError: if the key URI uses any other scheme.
    """
    uri = rfc3986.urlparse(acme_account_key)
    if uri.scheme == "file":
        # Read raw bytes: load_pem_private_key expects bytes, and a text-mode
        # read would hand it a str.
        with open(uri.path, "rb") as f:
            key = f.read()
    elif uri.scheme == "s3":
        # uri.path includes a leading "/"
        response = s3_client.get_object(Bucket=uri.host, Key=uri.path[1:])
        key = response["Body"].read()
    else:
        raise ValueError(
            "Invalid acme account key: {!r}".format(acme_account_key))

    key = serialization.load_pem_private_key(key,
                                             password=None,
                                             backend=default_backend())
    return acme_client_for_private_key(acme_directory_url, key)
Exemplo n.º 19
0
def url_path(url_path):
    """Raise ``exceptions.InvalidURLPath`` unless ``url_path`` is a bare absolute path.

    Returns ``True`` when the normalized value has no scheme, userinfo, host
    or port and its path starts with ``/``.  Parsing errors are also reported
    as ``InvalidURLPath``.
    """
    try:
        parsed = rfc3986.urlparse(rfc3986.normalize_uri(url_path))

        # Anything beyond a path component disqualifies the value.
        if parsed.scheme or parsed.userinfo or parsed.host or parsed.port:
            raise exceptions.InvalidURLPath(url_path=url_path)

        # The path must exist and be absolute.
        if parsed.path is None or not parsed.path.startswith('/'):
            raise exceptions.InvalidURLPath(url_path=url_path)
    except Exception:
        raise exceptions.InvalidURLPath(url_path=url_path)
    return True
Exemplo n.º 20
0
def setup_acme_client(s3_client, acme_directory_url, acme_account_key):
    """Build an ACME client from the account key found at ``acme_account_key``.

    ``acme_account_key`` is a URI: ``file:`` keys are read from the local
    path, ``s3:`` keys are fetched with ``s3_client`` using the URI host as
    bucket and the path (without its leading ``/``) as object key.

    Raises:
        ValueError: if the URI scheme is neither ``file`` nor ``s3``.
    """
    uri = rfc3986.urlparse(acme_account_key)
    if uri.scheme == "file":
        # Binary mode: the PEM loader below expects bytes, not str.
        with open(uri.path, "rb") as f:
            key = f.read()
    elif uri.scheme == "s3":
        # uri.path includes a leading "/"
        response = s3_client.get_object(Bucket=uri.host, Key=uri.path[1:])
        key = response["Body"].read()
    else:
        raise ValueError(
            "Invalid acme account key: {!r}".format(acme_account_key)
        )

    key = serialization.load_pem_private_key(
        key, password=None, backend=default_backend()
    )
    return acme_client_for_private_key(acme_directory_url, key)
    def _get_grpc_channel(self):
        """Open a gRPC channel to the service endpoint and print its target.

        The endpoint is ``self.options["endpoint"]`` when set, otherwise
        ``self.metadata["endpoint"]``.

        Raises:
            ValueError: if the endpoint scheme is neither ``http`` nor ``https``.
        """
        endpoint = self.options.get("endpoint", None)
        if endpoint is None:
            endpoint = self.metadata["endpoint"]
        parsed = urlparse(endpoint)

        # The channel target is "host" or "host:port".
        if parsed.port is None:
            target = parsed.hostname
        else:
            target = parsed.hostname + ":" + str(parsed.port)

        print("Opening grpc to " + target)

        scheme = parsed.scheme
        if scheme == "http":
            return grpc.insecure_channel(target)
        if scheme == "https":
            return grpc.secure_channel(target,
                                       grpc.ssl_channel_credentials())
        raise ValueError(
            'Unsupported scheme in service metadata ("{}")'.format(scheme))
Exemplo n.º 22
0
    def __init__(self, config, metadata_provider=None):
        """Create the Ethereum, MPE, IPFS, Registry and account objects from ``config``.

        ``config`` is read with ``.get``; the keys ``eth_rpc_endpoint``,
        ``mpe_contract_address``, ``ipfs_rpc_endpoint`` and
        ``registry_contract_address`` are optional.
        """
        self._config = config
        self._metadata_provider = metadata_provider

        # Ethereum client
        # NOTE(review): the default endpoint embeds what looks like a project
        # key -- confirm it is meant to ship in source.
        eth_rpc_endpoint = self._config.get(
            "eth_rpc_endpoint",
            "https://mainnet.infura.io/v3/e7732e1f679e461b9bb4da5653ac3fc2")
        self.web3 = web3.Web3(web3.HTTPProvider(eth_rpc_endpoint))
        self.web3.eth.setGasPriceStrategy(medium_gas_price_strategy)

        # MPE contract; an explicit address is mostly for local testing
        mpe_address = self._config.get("mpe_contract_address", None)
        if mpe_address is not None:
            self.mpe_contract = MPEContract(self.web3, mpe_address)
        else:
            self.mpe_contract = MPEContract(self.web3)

        # IPFS client; scheme and port fall back to http / 5001
        ipfs_endpoint = urlparse(self._config.get(
            "ipfs_rpc_endpoint", "https://ipfs.singularitynet.io:80"))
        ipfs_scheme = ipfs_endpoint.scheme or "http"
        ipfs_port = ipfs_endpoint.port or 5001
        ipfs_host = urljoin(ipfs_scheme, ipfs_endpoint.hostname)
        self.ipfs_client = ipfsapi.connect(ipfs_host, ipfs_port)

        # Registry contract; an explicit address is mostly for local testing
        registry_address = self._config.get(
            "registry_contract_address", None)
        if registry_address is not None:
            self.registry_contract = get_contract_object(
                self.web3, "Registry.json", registry_address)
        else:
            self.registry_contract = get_contract_object(
                self.web3, "Registry.json")

        self.account = Account(self.web3, config, self.mpe_contract)
def setup_acme_client(s3_client, acme_directory_url, acme_account_key):
    """Load the ACME account key and build an ACME client from it.

    ``acme_account_key`` may be a ``file:`` URI, a scheme-less path, or an
    ``s3:`` URI (host is the bucket, path without its leading ``/`` is the
    object key).

    Raises:
        ValueError: if the URI uses any other scheme.
    """
    uri = rfc3986.urlparse(acme_account_key)
    if uri.scheme == 'file' or uri.scheme is None:
        if uri.host is None:
            path = uri.path
        elif uri.path is None:
            path = uri.host
        else:
            # NOTE(review): when uri.path starts with "/", os.path.join
            # discards uri.host -- confirm that is the intended result.
            path = os.path.join(uri.host, uri.path)
        # Read raw bytes so both branches produce the same type.
        with open(path, 'rb') as f:
            key = f.read()
    elif uri.scheme == 's3':
        # uri.path includes a leading "/"
        response = s3_client.get_object(Bucket=uri.host, Key=uri.path[1:])
        key = response['Body'].read()
    else:
        raise ValueError(
            'Invalid acme account key: {!r}'.format(acme_account_key))

    # The PEM loader needs bytes.  Encode only if a str was returned, since
    # calling .encode() on bytes would raise.
    if isinstance(key, str):
        key = key.encode("utf-8")

    key = serialization.load_pem_private_key(key,
                                             password=None,
                                             backend=default_backend())
    return acme_client_for_private_key(acme_directory_url, key)
Exemplo n.º 24
0
def _parse_row(element):
    if element[1] == "-":
        return None

    parsed = urlparse(element[0])

    # Apparently sometimes there's no host
    if parsed.host is None or parsed.path is None:
        return None

    # check if url part of property
    if not parsed.host.endswith("wdr.de") or not parsed.path.startswith(
            "/nachrichten"):
        return None

    # get cononical url and get_parameters
    query = parsed.query
    url = parsed.copy_with(query=None, fragment=None).unsplit()

    # parse headline
    headline_raw = html.unescape(element[1].split("_")[-1])
    headline = re.sub(r"<.*?>", "", headline_raw)

    return url, headline, query
Exemplo n.º 25
0
    def __init__(self, config):
        """Create the Ethereum, MPE, IPFS, Registry and account objects from ``config``.

        ``config`` is read with ``.get``; ``eth_rpc_endpoint`` and
        ``ipfs_rpc_endpoint`` are optional and have built-in defaults.
        """
        self._config = config

        # Ethereum client
        eth_rpc_endpoint = self._config.get("eth_rpc_endpoint",
                                            "https://mainnet.infura.io")
        self.web3 = web3.Web3(web3.HTTPProvider(eth_rpc_endpoint))
        self.web3.eth.setGasPriceStrategy(medium_gas_price_strategy)

        self.mpe_contract = MPEContract(self.web3)

        # IPFS client; scheme and port fall back to http / 5001
        ipfs_endpoint = urlparse(self._config.get(
            "ipfs_rpc_endpoint", "https://ipfs.singularitynet.io:80"))
        ipfs_scheme = ipfs_endpoint.scheme or "http"
        ipfs_port = ipfs_endpoint.port or 5001
        ipfs_host = urljoin(ipfs_scheme, ipfs_endpoint.hostname)
        self.ipfs_client = ipfsapi.connect(ipfs_host, ipfs_port)

        self.registry_contract = get_contract_object(self.web3,
                                                     "Registry.json")
        self.account = Account(self.web3, config, self.mpe_contract)
Exemplo n.º 26
0
 def url(self):
     """Return the wrapped Starlette request's URL parsed with ``rfc3986.urlparse``."""
     raw_url = str(self._starlette.url)
     return rfc3986.urlparse(raw_url)
Exemplo n.º 27
0
def test_urlparse_a_unicode_hostname():
    """Parsing the encoded ``SNOWMAN_HOST`` yields its decoded host."""
    # Skip the first seven characters (presumably the "http://" prefix).
    expected_host = SNOWMAN_HOST.decode('utf-8')[7:]
    result = urlparse(SNOWMAN_HOST)
    assert result.host == expected_host
Exemplo n.º 28
0
def test_urlparse_a_unicode_hostname_with_auth():
    """Userinfo is still extracted when the host contains ``SNOWMAN``."""
    encoded_url = b'http://userinfo@' + SNOWMAN + b'.com'
    assert urlparse(encoded_url).userinfo == 'userinfo'
Exemplo n.º 29
0
    def prepare_url(self, url, params, validate=False):
        """Prepare the given HTTP URL and store it in ``self.url``.

        Args:
            url: the URL as ``str``, ``bytes`` (decoded as UTF-8) or any
                object with a string representation.
            params: query parameters, passed to ``self._encode_params`` and
                appended to any query already present in ``url``.
            validate: when true, ``rfc3986.normalize_uri`` is also run on the
                raw URL so that its errors surface as ``InvalidURL``.

        Raises:
            InvalidURL: if the URL cannot be parsed, has no host, or has an
                invalid host label.
            MissingScheme: if the URL has no scheme.
        """
        #: Accept objects that have string representations.
        #: We're unable to blindly call unicode/str functions
        #: as this will include the bytestring indicator (b'')
        #: on python 3.x.
        #: https://github.com/requests/requests/pull/2238
        if isinstance(url, bytes):
            url = url.decode("utf8")
        else:
            url = str(url)
        # Ignore any leading and trailing whitespace characters.
        url = url.strip()
        # Don't do any URL preparation for non-HTTP schemes like `mailto`,
        # `data` etc to work around exceptions from `url_parse`, which
        # handles RFC 3986 only.
        if ":" in url and not url.lower().startswith("http"):
            self.url = url
            return

        # Support for unicode domain names and paths.
        try:
            uri = rfc3986.urlparse(url)
            if validate:
                rfc3986.normalize_uri(url)
        except rfc3986.exceptions.RFC3986Exception as err:
            raise InvalidURL(f"Invalid URL {url!r}: URL is improper.") from err

        if not uri.scheme:
            error = (
                "Invalid URL {0!r}: No scheme supplied. Perhaps you meant http://{0}?"
            )
            error = error.format(to_native_string(url, "utf8"))
            raise MissingScheme(error)

        if not uri.host:
            raise InvalidURL(f"Invalid URL {url!r}: No host supplied")

        # In general, we want to try IDNA encoding the hostname if the string contains
        # non-ASCII characters. This allows users to automatically get the correct IDNA
        # behaviour. For strings containing only ASCII characters, we need to also verify
        # it doesn't start with a wildcard (*), before allowing the unencoded hostname.
        if not unicode_is_ascii(uri.host):
            try:
                uri = uri.copy_with(host=self._get_idna_encoded_host(uri.host))
            except UnicodeError as err:
                raise InvalidURL("URL has an invalid label.") from err

        elif uri.host.startswith("*"):
            raise InvalidURL("URL has an invalid label.")

        # Bare domains aren't valid URLs.
        if not uri.path:
            uri = uri.copy_with(path="/")
        if isinstance(params, (str, bytes)):
            params = to_native_string(params)
        enc_params = self._encode_params(params)
        if enc_params:
            if uri.query:
                uri = uri.copy_with(query=f"{uri.query}&{enc_params}")
            else:
                uri = uri.copy_with(query=enc_params)
        # Normalize the URI.
        self.url = rfc3986.normalize_uri(uri.unsplit())
Exemplo n.º 30
0
 def url(self):
     """Return ``self.full_url`` parsed with ``rfc3986.urlparse``."""
     full_url = self.full_url
     return rfc3986.urlparse(full_url)
Exemplo n.º 31
0
def test_urlparse_an_invalid_authority_parses_port():
    """Port, userinfo and host are still extracted when the userinfo contains ``@``."""
    # The userinfo must be literally "foo:b@r" for the assertion below to
    # hold; a masked placeholder in the URL would make the test fail.
    url = 'http://foo:b@r@[::1]:80/get'
    parsed = urlparse(url)
    assert parsed.port == 80
    assert parsed.userinfo == 'foo:b@r'
    assert parsed.hostname == '[::1]'
Exemplo n.º 32
0
def test_unsplit_idna_a_unicode_hostname():
    """``unsplit(use_idna=True)`` on ``SNOWMAN_HOST`` gives ``SNOWMAN_IDNA_HOST``."""
    rebuilt = urlparse(SNOWMAN_HOST).unsplit(use_idna=True)
    assert rebuilt == SNOWMAN_IDNA_HOST
Exemplo n.º 33
0
 def _get_ipfs_client(self):
     """Connect to the IPFS endpoint returned by ``self.config.get_ipfs_endpoint()``.

     The scheme defaults to ``http`` and the port to ``5001`` when the
     endpoint does not specify them.
     """
     endpoint = urlparse(self.config.get_ipfs_endpoint())
     scheme = endpoint.scheme or "http"
     port = endpoint.port or 5001
     host = urljoin(scheme, endpoint.hostname)
     return ipfsapi.connect(host, port)
Exemplo n.º 34
0
def test_port_parsing(port):
    """``rfc3986.urlparse`` raises ``InvalidPort`` for the given ``port`` value."""
    url = 'https://httpbin.org:{0}/get'.format(port)
    with pytest.raises(exceptions.InvalidPort):
        rfc3986.urlparse(url)
Exemplo n.º 35
0
def uri_validator(uri):
    """Return ``True`` when ``uri`` parses with both a scheme and a network location.

    Any ordinary exception raised while parsing is treated as "not valid"
    and yields ``False``.  ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately not swallowed.
    """
    try:
        result = urlparse(uri)
        return bool(result.scheme and result.netloc)
    except Exception:
        return False
Exemplo n.º 36
0
def do_parse(uri):
    """Parse ``uri`` with ``rfc3986.urlparse`` and return the parse result."""
    parsed = rfc3986.urlparse(uri)
    return parsed