예제 #1
0
    def preprocess_url(self, referrer, url):
        ''' Normalise a link and decide whether it should be scraped.

        The link is resolved against ``referrer``, loses one trailing slash
        and its fragment, and is returned only when it belongs to
        ``self.domain``, contains none of the ignored substrings and is not
        yet in ``self.urls`` under its http or https form. In every other
        case ``None`` is returned.
        '''
        skip_markers = ['.pdf', '.jpg', 'tel:', '.dmg']

        if not url:
            return None

        # Make the link absolute, then rebuild it without trailing / and
        # without the in-page target.
        parts = urlsplit(urljoin(referrer, url))._asdict()
        parts['path'] = re.sub(r'/$', '', parts['path'])
        parts['fragment'] = ''
        target = SplitResult(**parts)

        # Only pages of the current domain are scraped
        if target.netloc != self.domain:
            return None

        cleanurl = target.geturl()
        if target.scheme == 'http':
            httpurl = cleanurl
            httpsurl = cleanurl.replace('http:', 'https:', 1)
        else:
            httpsurl = cleanurl
            httpurl = cleanurl.replace('https:', 'http:', 1)

        if any(marker in httpsurl or marker in httpurl
               for marker in skip_markers):
            return None

        # A URL counts as known when either scheme variant was already listed
        already_listed = httpurl in self.urls or httpsurl in self.urls
        return None if already_listed else cleanurl
예제 #2
0
def normalize(uristr):
    """
    Return the normalized form of the given URI.

    A leading proxy prefix is removed first. A string whose scheme is not
    one of ``URL_SCHEMES``, or which has no hostname, is returned without
    further changes; otherwise every component is normalized and the URI is
    rebuilt without its fragment.

    :type uristr: unicode
    :rtype: unicode
    """

    # Proxied URLs consist of VIA_PREFIX followed by a regular URL
    proxied_starts = tuple(
        VIA_PREFIX + scheme + ":" for scheme in URL_SCHEMES)
    if uristr.startswith(proxied_starts):
        uristr = uristr[len(VIA_PREFIX):]

    parts = urlsplit(uristr)

    # Non-URLs and URLs lacking a hostname are handed back as they are
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return uristr

    normalized = SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,  # the fragment is dropped
    )
    return normalized.geturl()
예제 #3
0
    def _add_params_to_url(url, params):
        """Adds parameters as a query part of the URL

        :param url: URL
        :type url: string

        :param params: Dictionary containing parameters
        :type params: Dict

        :return: URL with parameters formatted as a query string
        :rtype: string
        """
        url_parts = urlsplit(url)

        # Extract the existing parameters specified in the redirection URI
        existing_params = parse_qs(url_parts.query)

        # Enrich our custom parameters with the existing ones
        params.update(existing_params)

        new_query = urlencode(params, True)
        url_parts = SplitResult(
            url_parts.scheme,
            url_parts.netloc,
            url_parts.path,
            new_query,
            url_parts.fragment,
        )
        url = url_parts.geturl()

        return url
예제 #4
0
파일: requests.py 프로젝트: kuteken/kobin
 def url(self):
     """Rebuild the request URL from the values read through ``self.get``.

     The forwarded proto/host entries win over ``wsgi.url_scheme`` (default
     ``'http'``) and ``HTTP_HOST``; the path comes from ``self.path`` and the
     query from ``QUERY_STRING``. The fragment is always empty.
     """
     scheme = self.get('HTTP_X_FORWARDED_PROTO')
     if not scheme:
         scheme = self.get('wsgi.url_scheme', 'http')

     netloc = self.get('HTTP_X_FORWARDED_HOST')
     if not netloc:
         netloc = self.get('HTTP_HOST')

     query = self.get("QUERY_STRING")
     return SplitResult(scheme, netloc, self.path, query, '').geturl()
예제 #5
0
파일: app.py 프로젝트: tonywu7/portal5
def resolve_url(requested: SplitResult, *, prefix=''):
    """Validate the requested URL and redirect to its canonical form if needed.

    An empty path is replaced by ``/``. The URL is then checked with
    ``fetch.guard_incoming_url``; a truthy result is passed to ``abort``.
    When the URL text differs from ``request.path`` without its first
    character, a 307 redirect to the URL (with the current query string
    appended) under ``prefix`` is returned. Otherwise the split URL itself
    is returned.
    """
    url_ = requested if requested.path else requested._replace(path='/')

    guard = fetch.guard_incoming_url(url_, request)
    if guard:
        abort(guard)

    # Already at the canonical location: nothing to redirect
    if url_.geturl() == request.path[1:]:
        return url_

    url = url_.geturl()
    if request.query_string:
        url = urljoin(url, f'?{request.query_string.decode("utf8")}')
    return redirect(f'{request.scheme}://{request.host}{prefix}/{url}', 307)
예제 #6
0
파일: fetch.py 프로젝트: tonywu7/portal5
def guard_incoming_url(requested: SplitResult, flask_request: Request):
    """Check an incoming URL and return an ``exceptions.Portal*`` object or None.

    - no scheme: ``PortalMissingProtocol`` built from the URL prefixed with
      ``https:`` plus the request's query string, if any;
    - scheme other than http/https: ``PortalUnsupportedScheme``;
    - no network location: ``PortalMissingDomain``;
    - otherwise ``None``.
    """
    scheme = requested.scheme

    if not scheme:
        query = flask_request.query_string.decode('utf8')
        suggestion = f'https:{requested.geturl()}'
        if query:
            suggestion = f'{suggestion}?{query}'
        return exceptions.PortalMissingProtocol(suggestion)

    if scheme not in {'http', 'https'}:
        return exceptions.PortalUnsupportedScheme(scheme)

    if not requested.netloc:
        return exceptions.PortalMissingDomain(requested.geturl())

    return None
    def preprocess_url(self, referrer, url):
        '''Return the cleaned absolute URL if it is on-domain and new, else None.

        The URL is resolved against ``referrer``; one trailing slash and the
        fragment are removed. It is rejected when its host differs from
        ``self.domain`` or when its http or https form is in ``self.urls``.
        '''
        if not url:
            return None

        components = urlsplit(urljoin(referrer, url))._asdict()
        components.update(
            path=re.sub(r'/$', '', components['path']),
            fragment='',
        )
        fields = SplitResult(**components)

        if fields.netloc != self.domain:
            return None

        cleanurl = fields.geturl()
        # variants holds (http form, https form) of the cleaned URL
        if fields.scheme == 'http':
            variants = (cleanurl, cleanurl.replace('http:', 'https:', 1))
        else:
            variants = (cleanurl.replace('https:', 'http:', 1), cleanurl)

        if any(variant in self.urls for variant in variants):
            return None
        return cleanurl
예제 #8
0
    def preprocess_url(self, referrer, url):
        '''
        Clean a link and filter it before scraping.

        Returns the absolute URL (one trailing slash and the fragment
        removed) when its host equals ``self.domain`` and neither its http
        nor its https form is in ``self.urls``; returns ``None`` otherwise.
        '''
        if not url:
            return None

        resolved = urlsplit(urljoin(referrer, url))
        target = resolved._replace(
            path=re.sub(r'/$', '', resolved.path),
            fragment='',
        )

        # Scrape pages of current domain only
        if target.netloc == self.domain:
            cleanurl = target.geturl()
            if target.scheme == 'http':
                httpurl = cleanurl
                httpsurl = cleanurl.replace('http:', 'https:', 1)
            else:
                httpurl = cleanurl.replace('https:', 'http:', 1)
                httpsurl = cleanurl

            seen = httpurl in self.urls or httpsurl in self.urls
            if not seen:
                return cleanurl

        return None
    def _get_escaped_full_path(self, request):
        """
        Return the request's full path with its query string escaped.

        Django considers "safe" some characters that aren't so for oauthlib,
        so every query character outside ``urlencoded`` is percent-quoted and
        the query is then re-encoded from its parsed key/value pairs.
        """
        components = list(urlparse(request.get_full_path()))

        # Index 4 of a urlparse() result is the query string
        raw_query = components[4]
        for char in set(raw_query).difference(urlencoded):
            raw_query = raw_query.replace(char, quote(char, safe=b""))
        components[4] = raw_query

        uri = urlsplit(urlunparse(components))
        pairs = parse_qsl(uri.query)
        query = urllib_urlencode(pairs, doseq=False)

        escaped = SplitResult(uri.scheme, uri.netloc, uri.path, query,
                              uri.fragment)
        return escaped.geturl()
def parse_origin(url):
    """
    Return the origin of a URL, or None when there is none to return.

    Per https://tools.ietf.org/html/rfc6454#section-7 the origin is
    ``<scheme> + '://' + <host> + <port>``; path, query and fragment are
    discarded. ``None`` is returned for a ``None`` input and whenever the
    rebuilt origin is an empty string.

    :param url: URL string
    :rtype: str or None
    """
    if url is None:
        return None

    # netloc carries both host and port
    scheme, netloc = urlsplit(url)[:2]
    return SplitResult(scheme, netloc, "", "", "").geturl() or None
예제 #11
0
    def run(self):
        """
        Run node, spawning entity and doing other actions as configured in program arguments.

        The entity XML is taken from the first configured source among
        ``args.file``, ``args.topic``, ``args.database`` and ``args.stdin``.
        It is validated by parsing, optionally rewritten so that
        ``package://`` mesh file names become ``model://``, and handed to
        ``self._spawn_entity`` with the pose built from the ``x``/``y``/``z``
        and ``R``/``P``/``Y`` arguments.

        Returns exit code, 1 for failure, 0 for success
        """
        # Wait for entity to exist if wait flag is enabled
        if self.args.wait:
            self.entity_exists = False

            def entity_cb(entity):
                # presumably entity.name holds the names carried by the
                # ModelStates message -- verify against the message definition
                self.entity_exists = self.args.wait in entity.name

            self.subscription = self.create_subscription(
                ModelStates, '%s/model_states' % self.args.gazebo_namespace,
                entity_cb, 10)

            self.get_logger().info(
                'Waiting for entity {} before proceeding.'.format(
                    self.args.wait))

            while rclpy.ok() and not self.entity_exists:
                rclpy.spin_once(self)

        # Load entity XML from file
        if self.args.file:
            self.get_logger().info('Loading entity XML from file %s' %
                                   self.args.file)
            # Messages are formatted before logging so that the logger always
            # receives one complete string.
            if not os.path.exists(self.args.file):
                self.get_logger().error(
                    'Error: specified file %s does not exist' % self.args.file)
                return 1
            if not os.path.isfile(self.args.file):
                self.get_logger().error(
                    'Error: specified file %s is not a file' % self.args.file)
                return 1
            # load file; the context manager closes the handle in every case
            try:
                with open(self.args.file, 'r') as f:
                    entity_xml = f.read()
            except IOError as e:
                self.get_logger().error('Error reading file {}: {}'.format(
                    self.args.file, e))
                return 1
            if entity_xml == '':
                self.get_logger().error('Error: file %s is empty' %
                                        self.args.file)
                return 1
        # Load entity XML published on topic specified
        elif self.args.topic:
            self.get_logger().info('Loading entity published on topic %s' %
                                   self.args.topic)
            entity_xml = ''

            def entity_xml_cb(msg):
                nonlocal entity_xml
                entity_xml = msg.data

            # NOTE(review): a durability policy value is passed in the
            # position where the model_states subscription above passes a
            # queue depth -- confirm this yields the intended QoS.
            self.subscription = self.create_subscription(
                String, self.args.topic, entity_xml_cb,
                QoSDurabilityPolicy.RMW_QOS_POLICY_DURABILITY_TRANSIENT_LOCAL)

            while rclpy.ok() and entity_xml == '':
                self.get_logger().info('Waiting for entity xml on %s' %
                                       self.args.topic)
                rclpy.spin_once(self)

        # Generate entity XML by putting requested entity name into request template
        elif self.args.database:
            self.get_logger().info(
                'Loading entity XML from Gazebo Model Database')
            entity_xml = self.MODEL_DATABASE_TEMPLATE.format(
                self.args.database)
        elif self.args.stdin:
            self.get_logger().info('Loading entity XML from stdin')
            entity_xml = sys.stdin.read()
            if entity_xml == '':
                self.get_logger().error('Error: stdin buffer was empty')
                return 1
        else:
            # Without this branch entity_xml would be unbound below
            self.get_logger().error(
                'Error: no source for the entity XML was specified')
            return 1

        # Parse xml to detect invalid xml before sending to gazebo
        try:
            xml_parsed = ElementTree.fromstring(entity_xml)
        except ElementTree.ParseError as e:
            self.get_logger().error('Invalid XML: {}'.format(e))
            return 1

        # Replace package:// with model:// for mesh tags if flag is set
        if self.args.package_to_model:
            for element in xml_parsed.iterfind('.//mesh'):
                filename_tag = element.get('filename')
                if filename_tag is None:
                    continue
                url = urlsplit(filename_tag)
                if url.scheme == 'package':
                    # Keep every component except the scheme
                    url = SplitResult('model', *url[1:])
                    element.set('filename', url.geturl())

        # Encode xml object back into string for service call
        entity_xml = ElementTree.tostring(xml_parsed)

        # Form requested Pose from arguments
        initial_pose = Pose()
        initial_pose.position.x = float(self.args.x)
        initial_pose.position.y = float(self.args.y)
        initial_pose.position.z = float(self.args.z)

        # presumably quaternion_from_euler returns (w, x, y, z) -- the
        # assignments below rely on that order; verify against the helper
        q = quaternion_from_euler(self.args.R, self.args.P, self.args.Y)
        initial_pose.orientation.w = q[0]
        initial_pose.orientation.x = q[1]
        initial_pose.orientation.y = q[2]
        initial_pose.orientation.z = q[3]

        success = self._spawn_entity(entity_xml, initial_pose)
        if not success:
            self.get_logger().error('Spawn service failed. Exiting.')
            return 1

        # TODO(shivesh): Wait for /set_model_configuration
        # (https://github.com/ros-simulation/gazebo_ros_pkgs/issues/779)
        # Apply joint positions if any specified
        # if len(self.args.joints) != 0:
        #     joint_names = [joint[0] for joint in self.args.joints]
        #     joint_positions = [joint[1] for joint in self.args.joints]
        #     success = _set_model_configuration(joint_names, joint_positions)
        #     if not success:
        #         self.get_logger().error('SetModelConfiguration service failed. Exiting.')
        #         return 1

        # Unpause physics if user requested
        if self.args.unpause:
            client = self.create_client(
                Empty, '%s/unpause_physics' % self.args.gazebo_namespace)
            if client.wait_for_service(timeout_sec=self.args.timeout):
                self.get_logger().info('Calling service %s/unpause_physics' %
                                       self.args.gazebo_namespace)
                client.call_async(Empty.Request())
            else:
                # The namespace is filled in and the two sentences are joined
                # without the indentation a backslash continuation would embed.
                self.get_logger().error(
                    ('Service %s/unpause_physics unavailable. '
                     'Was Gazebo started with GazeboRosInit?') %
                    self.args.gazebo_namespace)

        # If bond enabled, setup shutdown callback and wait for shutdown
        if self.args.bond:
            self.get_logger().info(
                'Waiting for shutdown to delete entity [{}]'.format(
                    self.args.entity))
            try:
                rclpy.spin(self)
            except KeyboardInterrupt:
                self.get_logger().info('Ctrl-C detected')
            self._delete_entity()

        return 0
예제 #12
0
    def run(self):
        '''
        Run node, spawning model and doing other actions as configured in program arguments.

        The model XML is taken from the first configured source among
        ``args.file``, ``args.param``, ``args.database`` and ``args.stdin``.
        It is validated by parsing, optionally rewritten so that
        ``package://`` mesh file names become ``model://``, and spawned
        through the URDF or SDF client of ``gazebo_interface``.

        Returns exit code, 1 for failure, 0 for success
        '''
        # Wait for model to exist if wait flag is enabled
        if self.args.wait:
            self.model_exists = False

            def models_cb(models):
                self.model_exists = self.args.wait in models.name

            rospy.Subscriber("%s/model_states" % self.args.gazebo_namespace,
                             ModelStates, models_cb)
            r = rospy.Rate(10)
            rospy.loginfo('Waiting for model {} before proceeding.'.format(
                self.args.wait))
            while not rospy.is_shutdown() and not self.model_exists:
                r.sleep()
            if rospy.is_shutdown():
                return 0

        # Load model XML from file
        if self.args.file:
            rospy.loginfo("Loading model XML from file %s" % self.args.file)
            if not os.path.exists(self.args.file):
                rospy.logfatal("Error: specified file %s does not exist",
                               self.args.file)
                return 1
            if not os.path.isfile(self.args.file):
                rospy.logfatal("Error: specified file %s is not a file",
                               self.args.file)
                return 1
            # load file; the context manager closes the handle in every case
            try:
                with open(self.args.file, 'r') as f:
                    model_xml = f.read()
            except IOError as e:
                rospy.logerr("Error reading file {}: {}".format(
                    self.args.file, e))
                return 1
            if model_xml == "":
                rospy.logerr("Error: file %s is empty", self.args.file)
                return 1
        # Load model XML from ROS param
        elif self.args.param:
            rospy.loginfo("Loading model XML from ros parameter %s" %
                          self.args.param)
            model_xml = rospy.get_param(self.args.param)
            if model_xml == "":
                rospy.logerr("Error: param does not exist or is empty")
                return 1
        # Generate model XML by putting requested model name into request template
        elif self.args.database:
            rospy.loginfo("Loading model XML from Gazebo Model Database")
            model_xml = self.MODEL_DATABASE_TEMPLATE.format(self.args.database)
        elif self.args.stdin:
            rospy.loginfo("Loading model XML from stdin")
            model_xml = sys.stdin.read()
            if model_xml == "":
                rospy.logerr("Error: stdin buffer was empty")
                return 1
        else:
            # Without this branch model_xml would be unbound below
            rospy.logerr("Error: no source for the model XML was specified")
            return 1

        # Parse xml to detect invalid xml before sending to gazebo
        try:
            xml_parsed = xml.etree.ElementTree.fromstring(model_xml)
        except xml.etree.ElementTree.ParseError as e:
            rospy.logerr('Invalid XML: {}'.format(e))
            return 1

        # Replace package:// with model:// for mesh tags if flag is set
        if self.args.package_to_model:
            for element in xml_parsed.iterfind('.//mesh'):
                filename_tag = element.get('filename')
                if filename_tag is None:
                    continue
                url = urlsplit(filename_tag)
                if url.scheme == 'package':
                    # Keep every component except the scheme
                    url = SplitResult('model', *url[1:])
                    element.set('filename', url.geturl())

        # Encode xml object back into string for service call
        model_xml = xml.etree.ElementTree.tostring(xml_parsed)

        # For Python 3
        if not isinstance(model_xml, str):
            model_xml = model_xml.decode(encoding='ascii')

        # Form requested Pose from arguments
        # x and y come from the private ROS parameters, z from the arguments
        initial_pose = Pose()
        initial_pose.position.x = rospy.get_param('~x_pos')
        initial_pose.position.y = rospy.get_param('~y_pos')
        initial_pose.position.z = self.args.z
        q = quaternion_from_euler(self.args.R, self.args.P, self.args.Y)
        initial_pose.orientation = Quaternion(*q)

        # Spawn model using urdf or sdf service based on arguments
        # (success stays False when neither flag is set)
        success = False
        if self.args.urdf:
            success = gazebo_interface.spawn_urdf_model_client(
                self.args.model, model_xml, self.args.robot_namespace,
                initial_pose, self.args.reference_frame,
                self.args.gazebo_namespace)
        elif self.args.sdf:
            success = gazebo_interface.spawn_sdf_model_client(
                self.args.model, model_xml, self.args.robot_namespace,
                initial_pose, self.args.reference_frame,
                self.args.gazebo_namespace)
        if not success:
            rospy.logerr('Spawn service failed. Exiting.')
            return 1

        # Apply joint positions if any specified
        if len(self.args.joints) != 0:
            joint_names = [joint[0] for joint in self.args.joints]
            joint_positions = [joint[1] for joint in self.args.joints]
            success = gazebo_interface.set_model_configuration_client(
                self.args.model, "", joint_names, joint_positions,
                self.args.gazebo_namespace)
            if not success:
                rospy.logerr('SetModelConfiguration service failed. Exiting.')
                return 1

        # Unpause physics if user requested
        if self.args.unpause:
            rospy.loginfo('Unpausing physics')
            rospy.wait_for_service('%s/unpause_physics' %
                                   self.args.gazebo_namespace)
            try:
                unpause_physics = rospy.ServiceProxy(
                    '%s/unpause_physics' % self.args.gazebo_namespace, Empty)
                unpause_physics()
            except rospy.ServiceException as e:
                rospy.logerr(
                    "Unpause physics service call failed: {}".format(e))
                return 1

        # If bond enabled, setup shutdown callback and wait for shutdown
        if self.args.bond:
            rospy.on_shutdown(self._delete_model)
            rospy.loginfo('Waiting for shutdown to delete model {}'.format(
                self.args.model))
            rospy.spin()

        return 0
예제 #13
0
def path_only(url: SplitResult) -> str:
    """Return the URL text that follows ``<scheme>://<netloc>``."""
    origin = '{}://{}'.format(url.scheme, url.netloc)
    full_text = url.geturl()
    return full_text[len(origin):]
예제 #14
0
def no_scheme(url: SplitResult) -> str:
    """Return the URL text that follows ``<scheme>:``."""
    scheme_prefix = '{}:'.format(url.scheme)
    full_text = url.geturl()
    return full_text[len(scheme_prefix):]
예제 #15
0
파일: environs.py 프로젝트: c-bata/kobin
 def url(self):
     """Rebuild the request URL from the values read through ``self.get``.

     ``HTTP_X_FORWARDED_PROTO`` and ``HTTP_X_FORWARDED_HOST`` are preferred
     when set; otherwise ``wsgi.url_scheme`` (default ``'http'``) and
     ``HTTP_HOST`` are used. The path is ``self.path``, the query is
     ``QUERY_STRING`` and the fragment is empty.
     """
     forwarded_proto = self.get('HTTP_X_FORWARDED_PROTO')
     scheme = forwarded_proto if forwarded_proto else self.get(
         'wsgi.url_scheme', 'http')

     forwarded_host = self.get('HTTP_X_FORWARDED_HOST')
     netloc = forwarded_host if forwarded_host else self.get('HTTP_HOST')

     components = (scheme, netloc, self.path, self.get("QUERY_STRING"), '')
     return SplitResult(*components).geturl()