예제 #1
0
 def __call__(self, data: str):
     """Validate that ``data`` is an acceptable connection URL.

     The checks run in this order:

     1. ``data`` must be parseable by ``URL``.
     2. If a ``db`` query parameter is present it must consist of digits.
     3. For the ``unix`` scheme: no host, no port, and a non-empty path.
     4. For the ``redis`` / ``redis+tls`` schemes: a host is required.

     Args:
         data: The URL string to validate.

     Raises:
         ValidationError: With one of the codes ``parse_error``,
             ``invalid_port``, ``unix_domain_socket_hostname``,
             ``unix_domain_socket_port``, ``unix_domain_socket_path`` or
             ``host_missing``.
     """
     try:
         url = URL(data)
     except ValueError as exc:
         # Chain the original error so the parse failure stays visible.
         raise ValidationError(_("URL cannot be parsed"),
                               code="parse_error") from exc

     if url.has_query_param('db') and not url.query_param('db').isdigit():
         # The message used to claim the *port* was invalid although this
         # check is about the ``db`` parameter. The error code is left
         # unchanged so callers matching on it keep working.
         raise ValidationError(_("Invalid database specified"),
                               code="invalid_port")

     scheme = url.scheme()
     if scheme == "unix":
         if url.host():
             raise ValidationError(
                 _("Hostname not supported for unix domain sockets"),
                 code="unix_domain_socket_hostname")
         if url.port():
             raise ValidationError(
                 _("Port not supported for unix domain sockets"),
                 code="unix_domain_socket_port")
         if not url.path():
             raise ValidationError(
                 _("No path specified for unix domain socket"),
                 code="unix_domain_socket_path")
     elif scheme in ("redis", "redis+tls"):
         if not url.host():
             raise ValidationError(_("No host specified"),
                                   code="host_missing")
예제 #2
0
 def url(self, path='/', **query):
     """Build a URL object for ``path``.

     When ``path`` carries no host, ``self.host`` is used; when it carries
     no scheme, ``'http'`` is used. Every keyword argument is applied as a
     query parameter through ``query_param``.

     Returns:
         The resulting ``URL`` object.
     """
     built = URL(path)
     if not built.host():
         built = built.host(self.host)
     if not built.scheme():
         built = built.scheme('http')
     for name, value in query.items():
         built = built.query_param(name, value)
     return built
예제 #3
0
 def deal_domain(response):
     """Annotate ``response`` with the domain, scheme and prefix of its URL.

     Sets ``response.page_domain``, ``response.scheme`` and
     ``response.page_prefix`` (``<scheme>://<domain>/``) from
     ``response.url``.
     """
     parsed = URL(response.url)
     domain = parsed.domain()
     url_scheme = parsed.scheme()
     response.page_domain = domain
     response.scheme = url_scheme
     # Read the values back from the response, as the attributes were just set.
     prefix = response.scheme + '://'
     prefix = prefix + response.page_domain
     response.page_prefix = prefix + '/'
예제 #4
0
    def __init__(self, url, save_dir='tmp'):
        """
        Set up logging, create a timestamped save directory and record the
        scheme + host of the site.

        @url: full url of a site
        @save_dir: dir to save site; a sub-directory named after the current
                   time (YYYYmmddHHMM) is created inside it
        """
        # The log is configured with the plain save_dir, not the timestamped one.
        self.logger = logger('file', 'sitelog.log', save_dir)
        log_info = self.logger.info
        log_info('-' * 20)
        log_info('start')
        log_info('start func: __init__')
        log_info('url: %s' % url)

        stamp = datetime.now().strftime('%Y%m%d%H%M')
        self.save_time = stamp
        target_dir = os.path.abspath(os.path.join(save_dir, stamp))
        self.save_dir = target_dir
        # Create the target directory on first use.
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        self.url = url
        parsed = URL(url)
        # Scheme and host only, e.g. http://m.sohu.com
        self.host = parsed.scheme() + '://' + parsed.host()
        print('%s: saving %s' % (stamp, self.url))
        log_info('end func: __init__')
예제 #5
0
    def extract_url(self, response):
        """Queue unseen, non-banned eastmoney links found on a page.

        For every entry of ``response.all_url`` that is not empty and does
        not end in a skipped file extension, the URL is made absolute
        (``response.page_prefix`` for domain-less URLs, ``http://`` for
        scheme-less ones). URLs whose domain does not contain ``eastmoney``
        are dropped. The remaining URLs are checked in one pipeline round
        trip against the ``been_url:`` and ``ban_host:`` keys; URLs that are
        neither already seen nor on a banned host are pushed onto
        ``self.redis_key``.

        Args:
            response: Object providing ``all_url``, ``page_prefix``,
                ``page_domain``, ``spider_name`` and ``pipe``.

        Returns:
            ``True`` when there was nothing to do or the push pipeline ran;
            ``None`` when the lookup pipeline returned no results or an
            unexpected number of results.
        """
        if not response.all_url:
            return True

        # Match real suffixes. The previous substring search inside
        # '.jpg.png...' also dropped short URLs such as 'a' and endings
        # such as 'jpg.'.
        skipped_suffixes = ('.jpg', '.png', '.gif', '.rar',
                            '.zip', '.doc', '.pdf', '.css')

        candidate_urls = []
        candidate_domains = []
        for url in response.all_url:
            if not url or url.endswith(skipped_suffixes):
                continue
            parsed = URL(url)
            url_domain = parsed.domain()
            if not url_domain:
                # NOTE(review): if page_prefix already ends with '/', this
                # produces a double slash - confirm against the producer.
                url = response.page_prefix + '/' + url
                url_domain = response.page_domain
            elif not parsed.scheme():
                url = 'http://' + url
            if 'eastmoney' not in url_domain:
                continue
            response.pipe.get(response.spider_name + 'been_url:' + url)
            candidate_domains.append(url_domain)
            candidate_urls.append(url)

        # The ban lookups are queued after all "been" lookups, so the result
        # list is [seen flags..., banned flags...].
        for url_domain in candidate_domains:
            response.pipe.get(response.spider_name + 'ban_host:' + url_domain)

        results = response.pipe.execute()
        url_count = len(candidate_urls)
        if not results or len(results) != url_count + len(candidate_domains):
            return None

        seen_flags = results[:url_count]
        banned_flags = results[url_count:]
        for url, already_seen, host_banned in zip(candidate_urls, seen_flags,
                                                  banned_flags):
            if already_seen or host_banned:
                continue
            response.pipe.lpush(self.redis_key, url)
        response.pipe.execute()
        return True
예제 #6
0
class Segments(object):
    """
    URL segment handler, not intended for direct use. The URL is constructed by
    joining base, path and segments.
    """
    def __init__(self, base, path, segments, defaults):
        """
        Store the base URL and the named segments.

        ``base`` and ``path`` are combined into the base URL object;
        ``segments`` (names) and ``defaults`` (initial values) are zipped
        into an ordered mapping, so the segment order is preserved.
        """
        # Preserve the base URL
        self.base = PURL(base, path=path)
        # Map the segments and defaults lists to an ordered dict
        self.segments = OrderedDict(zip(segments, defaults))

    def build(self):
        """Return the base URL with the current segment values appended."""
        # Join base segments and segments
        segments = self.base.path_segments() + tuple(self.segments.values())

        # Create a new URL with the segments replaced
        return self.base.path_segments(segments)

    def full_path(self):
        """
        Return the built URL without its ``scheme://netloc`` prefix and
        without the leading slash.

        Only the leading prefix is removed. The earlier implementation
        removed the host and scheme text with a global ``str.replace`` and
        then cut four characters, which mangled paths containing that text
        (e.g. a segment named ``http``) and URLs with an explicit port.
        """
        url = self.build()
        text = url.as_string()
        origin = '%s://%s' % (url.scheme(), url.netloc())
        if text.startswith(origin):
            text = text[len(origin):]
        # Drop the single leading slash, as before.
        if text.startswith('/'):
            text = text[1:]
        return text

    def __str__(self):
        """Return the built URL as a string."""
        return self.build().as_string()

    def _get_segment(self, segment):
        """Return the current value of the named segment."""
        return self.segments[segment]

    def _set_segment(self, segment, value):
        """Set the value of the named segment."""
        self.segments[segment] = value

    @classmethod
    def _segment(cls, segment):
        """
        Returns a property capable of setting and getting a segment.
        """
        return property(
            fget=lambda x: cls._get_segment(x, segment),
            fset=lambda x, v: cls._set_segment(x, segment, v),
        )
예제 #7
0
파일: helpers.py 프로젝트: clld/clld
def maybe_external_link(text, **kw):
    """Wrap ``text`` in an external link when it is an http(s) URL with a host.

    Any other text is returned unchanged. Extra keyword arguments are passed
    through to ``external_link``.
    """
    parsed = URL(text)
    if not parsed.host():
        return text
    if parsed.scheme() not in ['http', 'https']:
        return text
    return external_link(text, **kw)
예제 #8
0
    # Do some sanity checks on the config
    requiredAttribs = [
        'serviceName', 'package', 'components', 'configurations'
    ]
    for attrib in requiredAttribs:
        if not attrib in service_config:
            log.error("Invalid configuration. Missing required attribute '%s'",
                      attrib)
            sys.exit(3)

    log.info('Installing service: %s on ambari host: %s',
             service_config['serviceName'], args.ambari_host)
    ambari_host_uri = URL(args.ambari_host)
    ambari_client = Ambari(ambari_host_uri.host(),
                           port=ambari_host_uri.port(),
                           protocol=ambari_host_uri.scheme(),
                           username=args.username,
                           password=args.password,
                           identifier='hdiapps')
    # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
    ambari_client.client.request_params['hooks'] = dict(
        response=shared_lib.Fixup(ambari_host_uri).fixup)
    # Assume we only have 1 cluster managed by this Ambari installation
    cluster = ambari_client.clusters.next()
    log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

    # Pull in any extra dynamic configuration
    if args.extra_config:
        try:
            extra_config = json.loads(args.extra_config)
            log.debug(
        service_config = config_request.json()
        log.debug('Service config: %s', service_config)
    except:
        log.error("Invalid configuration URI", exc_info=True)
        sys.exit(2)

    # Do some sanity checks on the config
    requiredAttribs = ['serviceName', 'package', 'components', 'configurations']
    for attrib in requiredAttribs:
        if not attrib in service_config:
            log.error("Invalid configuration. Missing required attribute '%s'", attrib)
            sys.exit(3)

    log.info('Installing service: %s on ambari host: %s', service_config['serviceName'], args.ambari_host)
    ambari_host_uri = URL(args.ambari_host)
    ambari_client = Ambari(ambari_host_uri.host(), port=ambari_host_uri.port(), protocol=ambari_host_uri.scheme(), username=args.username, password=args.password, identifier='hdiapps')
    # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
    ambari_client.client.request_params['hooks'] = dict(response=shared_lib.Fixup(ambari_host_uri).fixup)
    # Assume we only have 1 cluster managed by this Ambari installation 
    cluster = ambari_client.clusters.next()
    log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

    # Pull in any extra dynamic configuration
    if args.extra_config:
        try:
            extra_config = json.loads(args.extra_config)
            log.debug('Applying dynamic service configuration specified on command-line: %s', extra_config)
        except:
            log.warning('Extra configuration specified by the -x argument could not be parsed as JSON. The value was \'%s\'. Details: ', args.extra_config, exc_info=True)
            extra_config = {}    
    else:
예제 #10
0
파일: taxa.py 프로젝트: clld/tsammalex-data
def wikipedia_url(s):  # pragma: no cover
    """Return ``s`` when it is an http(s) URL whose host contains
    ``'wikipedia.'``; otherwise return ``None``.
    """
    parsed = URL(s)
    if parsed.scheme() not in ['http', 'https']:
        return None
    if 'wikipedia.' not in parsed.host():
        return None
    return s
예제 #11
0
def maybe_external_link(text, **kw):
    """Return an external link for ``text`` if it is an http(s) URL with a
    host, else return ``text`` itself.

    Keyword arguments are forwarded to ``external_link``.
    """
    candidate = URL(text)
    linkable = candidate.host() and candidate.scheme() in ('http', 'https')
    return external_link(text, **kw) if linkable else text
예제 #12
0
def wikipedia_url(s):  # pragma: no cover
    """Pass ``s`` through only if it is an http(s) URL with ``'wikipedia.'``
    in its host; any other input yields ``None``.
    """
    link = URL(s)
    is_web = link.scheme() in ('http', 'https')
    return s if is_web and 'wikipedia.' in link.host() else None
예제 #13
0
def _show(url):
    """Print a URL object, then its ``as_string()`` form."""
    print(url)
    print(url.as_string())


# 1) Build a URL by parsing a string.
str_url = URL('https://www.google.com/search?q=google')
_show(str_url)

# 2) Build a URL from keyword arguments.
argument_url = URL(
    scheme='https',
    host='www.google.com',
    path='/search',
    query='q=google',
)
_show(argument_url)

# 3) Build a URL step by step; each call returns a new URL object.
inline_url = URL().scheme('https')
inline_url = inline_url.domain('www.google.com')
inline_url = inline_url.path('search')
inline_url = inline_url.query_param('q', 'google')
_show(inline_url)

# Print each component of a parsed URL, one accessor at a time.
u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
for accessor in ('scheme', 'host', 'domain', 'username', 'password',
                 'netloc', 'port', 'path', 'query', 'path_segments'):
    print(getattr(u, accessor)())
print(u.query_param('ssl'))
print(u.query_param('ssl', as_list=True))
print(u.query_params())
print(u.has_query_param('ssl'))
print(u.subdomains())