def __call__(self, data: str):
    """Validate *data* as a redis connection URL.

    Accepts ``redis://``, ``redis+tls://`` and ``unix://`` URLs and raises
    ``ValidationError`` with a specific ``code`` for each failure mode.
    """
    try:
        url = URL(data)
    except ValueError:
        raise ValidationError(_("URL cannot be parsed"), code="parse_error")
    # The optional ?db=N parameter selects the redis database; it must be
    # a non-negative integer. (Previously this raised the copy-pasted
    # "Invalid port specified" / invalid_port message from the port check.)
    if url.has_query_param('db'):
        if not url.query_param('db').isdigit():
            raise ValidationError(
                _("Invalid database number specified"), code="invalid_db")
    if url.scheme() == "unix":
        # A unix domain socket is addressed purely by filesystem path.
        if url.host():
            raise ValidationError(
                _("Hostname not supported for unix domain sockets"),
                code="unix_domain_socket_hostname")
        if url.port():
            raise ValidationError(
                _("Port not supported for unix domain sockets"),
                code="unix_domain_socket_port")
        if not url.path():
            raise ValidationError(
                _("No path specified for unix domain socket"),
                code="unix_domain_socket_path")
    if url.scheme() in ("redis", "redis+tls"):
        if not url.host():
            raise ValidationError(_("No host specified"), code="host_missing")
def _collect_external_links(self, response):
    """Collect hosts of external links found in *response*.

    Any absolute link whose host does not contain ``self.domain`` is
    recorded (by host) in ``self.external_links``.
    """
    for href in response.html.absolute_links:
        host = URL(href).host()
        if self.domain not in host:
            self.external_links.add(host)
def url(self, path='/', **query):
    """Build a URL for *path*, filling in scheme/host defaults and *query* params.

    A missing host defaults to ``self.host``; a missing scheme defaults to
    ``http``. Every keyword argument becomes a query parameter.
    """
    result = URL(path)
    if not result.scheme():
        result = result.scheme('http')
    if not result.host():
        result = result.host(self.host)
    for name, value in query.items():
        result = result.query_param(name, value)
    return result
def __init__(self, url, save_dir='tmp'):
    """
    @url: full url of a site
    @save_dir: dir to save site
    """
    # Per-run file logger, written next to the save directory.
    self.logger = logger('file', 'sitelog.log', save_dir)
    self.logger.info('-' * 20)
    self.logger.info('start')
    self.logger.info('start func: __init__')
    self.logger.info('url: %s' % url)
    # Timestamped subdirectory, e.g. tmp/202401011230
    save_time = datetime.strftime(datetime.now(), '%Y%m%d%H%M')
    self.save_time = save_time
    self.save_dir = os.path.abspath(os.path.join(save_dir, save_time))
    # create dir if not exist
    if not os.path.isdir(self.save_dir):
        os.makedirs(self.save_dir)
    self.url = url
    u = URL(url)
    # get host like: http://m.sohu.xom
    self.host = u.scheme() + '://' + u.host()
    # Parenthesized print works on both Python 2 and 3
    # (was a Python 2-only `print` statement).
    print('%s: saving %s' % (save_time, self.url))
    self.logger.info('end func: __init__')
def cc_link(req, license_url, button="regular"):
    """Render a Creative Commons license button linking to *license_url*.

    Returns an ``HTML.a`` element wrapping the appropriate license icon, or
    ``None`` when the URL is not a recognised creativecommons.org license.
    """
    # Wikipedia's public-domain page is treated as an alias for CC0.
    if license_url == "http://en.wikipedia.org/wiki/Public_domain":
        license_url = "http://creativecommons.org/publicdomain/zero/1.0/"
    license_url = URL(license_url)
    if license_url.host() != "creativecommons.org":
        return
    path_parts = license_url.path().split("/")
    if len(path_parts) < 3:
        return  # pragma: no cover
    names = {
        "zero": "Public Domain",
        "by": "Creative Commons Attribution License",
        "by-nc": "Creative Commons Attribution-NonCommercial License",
        "by-nc-nd": "Creative Commons Attribution-NonCommercial-NoDerivatives License",
        "by-nc-sa": "Creative Commons Attribution-NonCommercial-ShareAlike License",
        "by-nd": "Creative Commons Attribution-NoDerivatives License",
        "by-sa": "Creative Commons Attribution-ShareAlike License",
    }
    code = path_parts[2]
    if code not in names:
        return
    small = button == "small"
    icon = "cc-" + code + ("-small" if small else "") + ".png"
    height, width = (15, 80) if small else (30, 86)
    img_attrs = dict(
        alt=names[code],
        src=req.static_url("clld:web/static/images/" + icon),
        height=height,
        width=width)
    return HTML.a(HTML.img(**img_attrs), href=license_url, rel="license")
def cc_link(req, license_url, button='regular'):
    """Render a Creative Commons license button linking to *license_url*.

    Returns an ``HTML.a`` element wrapping the license icon, or ``None``
    when the URL is not a recognised creativecommons.org license.
    """
    # Treat Wikipedia's public-domain article as a synonym for CC0.
    if license_url == 'https://en.wikipedia.org/wiki/Public_domain':
        license_url = 'https://creativecommons.org/publicdomain/zero/1.0/'
    license_url = URL(license_url)
    if license_url.host() != 'creativecommons.org':
        return
    segments = license_url.path().split('/')
    if len(segments) < 3:
        return  # pragma: no cover
    license_names = {
        'zero': 'Public Domain',
        'by': 'Creative Commons Attribution License',
        'by-nc': 'Creative Commons Attribution-NonCommercial License',
        'by-nc-nd': 'Creative Commons Attribution-NonCommercial-NoDerivatives License',
        'by-nc-sa': 'Creative Commons Attribution-NonCommercial-ShareAlike License',
        'by-nd': 'Creative Commons Attribution-NoDerivatives License',
        'by-sa': 'Creative Commons Attribution-ShareAlike License'}
    code = segments[2]
    if code not in license_names:
        return
    is_small = button == 'small'
    icon = 'cc-' + code + ('-small' if is_small else '') + '.png'
    height, width = (15, 80) if is_small else (30, 86)
    img_attrs = dict(
        alt=license_names[code],
        src=req.static_url('clld:web/static/images/' + icon),
        height=height,
        width=width)
    return HTML.a(HTML.img(**img_attrs), href=license_url, rel='license')
def get_bib(args):
    """Yield a ``Document`` for each record in ``Dogon.bib``.

    URLs pointing at files which have been uploaded to CDSTAR are removed
    from the record's ``url`` field and attached as ``doc.files`` instead;
    all other URLs are kept.
    """
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    # Map a bare filename to the metadata of its CDSTAR upload, collected
    # from the texts/docs/data manifests plus the edmond manifest.
    fname_to_cdstar = {}
    for type_ in ['texts', 'docs', 'data']:
        for hash_, paths in load(args.data_file('repos', type_ + '.json')).items():
            if hash_ in uploaded:
                for path in paths:
                    fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    for hash_, paths in load(args.data_file('repos', 'edmond.json')).items():
        if hash_ in uploaded:
            for path in paths:
                fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    db = Database.from_file(args.data_file('repos', 'Dogon.bib'), lowercase=True)
    for rec in db:
        doc = Document(rec)
        newurls = []
        # The url field holds a ';'-separated list of URLs.
        for url in rec.get('url', '').split(';'):
            if not url.strip():
                continue
            # URLs ending in 'sequence=1' are kept verbatim.
            if url.endswith('sequence=1'):
                newurls.append(url)
                continue
            url = URL(url.strip())
            # Host-less URLs ('') and project hosts are assumed to name
            # uploaded files; everything else is preserved as a URL.
            if url.host() in ['dogonlanguages.org', 'github.com', '']:
                fname = url.path().split('/')[-1]
                # NOTE(review): raises KeyError if fname was never uploaded
                # to CDSTAR — confirm this is the intended fail-fast.
                doc.files.append((fname, fname_to_cdstar[fname]))
            else:
                newurls.append(url.as_string())
        doc.rec['url'] = '; '.join(newurls)
        yield doc
def cc_link(req, license_url, button='regular'):
    """Render a Creative Commons license button linking to *license_url*.

    Returns an ``HTML.a`` element wrapping the license icon, or ``None``
    when the URL is not a recognised creativecommons.org license.
    """
    if license_url == 'https://en.wikipedia.org/wiki/Public_domain':
        # Alias Wikipedia's public-domain page to CC0.
        license_url = 'https://creativecommons.org/publicdomain/zero/1.0/'  # pragma: no cover
    license_url = URL(license_url)
    if license_url.host() != 'creativecommons.org':
        return
    segments = license_url.path().split('/')
    if len(segments) < 3:
        return  # pragma: no cover
    titles = {
        'zero': 'Public Domain',
        'by': 'Creative Commons Attribution License',
        'by-nc': 'Creative Commons Attribution-NonCommercial License',
        'by-nc-nd': 'Creative Commons Attribution-NonCommercial-NoDerivatives License',
        'by-nc-sa': 'Creative Commons Attribution-NonCommercial-ShareAlike License',
        'by-nd': 'Creative Commons Attribution-NoDerivatives License',
        'by-sa': 'Creative Commons Attribution-ShareAlike License'}
    variant = segments[2]
    if variant not in titles:
        return
    small = button == 'small'
    icon = 'cc-' + variant + ('-small' if small else '') + '.png'
    height, width = (15, 80) if small else (30, 86)
    attrs = dict(
        alt=titles[variant],
        src=req.static_url('clld:web/static/images/' + icon),
        height=height,
        width=width)
    return HTML.a(HTML.img(**attrs), href=license_url, rel='license')
def info(self, url):
    """Interface method to be called when processing new images.

    This method ties together the DataProvider workflow: parse the URL,
    derive the provider-specific id, fetch its info and postprocess it.
    """
    parsed = URL(url)
    provider_id = self.id_from_url(parsed, parsed.host(), parsed.path_segments())
    return self.postprocess(self.info_for_id(provider_id))
def maybe_license_link(req, license, **kw):
    """Render *license* as a CC button, an external link, or plain text.

    Tries a Creative Commons button first; falls back to an external link
    when the value parses as a URL with a host; otherwise returns the raw
    license string unchanged.
    """
    button = kw.pop('button', 'regular')
    cc = cc_link(req, license, button=button)
    if cc:
        return cc
    license_url = URL(license)
    if not license_url.host():
        return license
    return external_link(license_url, **kw)
def get_video_id_from_url(self, video_url):
    """Extract the ``v`` query parameter from a YouTube watch URL.

    Raises ``DownytError`` when the host does not look like YouTube.
    NOTE(review): assumes the URL has a host — confirm callers never pass
    host-less strings.
    """
    parsed = URL(video_url)
    if 'youtube' not in parsed.host():
        raise DownytError(
            'Provided URL is not from YouTube: {}'.format(parsed)
        )
    return parsed.query_param('v')
def blog_feed(request):
    """
    Proxy feeds from the blog, so they can be accessed via XHR requests.

    We also convert RSS to ATOM so that clld's javascript Feed component can read them.
    """
    if not request.params.get('path'):
        raise HTTPNotFound()
    path = URL(request.params['path'])
    # Only blog-relative paths may be proxied; an absolute URL would turn
    # this view into an open proxy. This was previously an `assert`, which
    # is stripped under `python -O` — raise 404 explicitly instead.
    if path.host():
        raise HTTPNotFound()
    try:
        return atom_feed(request, request.blog.url(path.as_string()))
    except ConnectionError:  # pragma: no cover
        raise HTTPNotFound()
def blog_feed(request):
    """
    Proxy feeds from the blog, so they can be accessed via XHR requests.

    We also convert RSS to ATOM so that clld's javascript Feed component can read them.
    """
    if not request.params.get('path'):
        raise HTTPNotFound()
    path = URL(request.params['path'])
    # Only blog-relative paths may be proxied; an absolute URL would turn
    # this view into an open proxy. This was previously an `assert`, which
    # is stripped under `python -O` — raise 404 explicitly instead.
    if path.host():
        raise HTTPNotFound()
    try:
        return atom_feed(request, request.blog.url(path.as_string()))
    except ConnectionError:
        raise HTTPNotFound()
def license_name(license_url):
    """Map a license URL to a short human-readable license name.

    Non-creativecommons.org URLs (and malformed CC paths) are returned
    unchanged; Wikipedia's public-domain page is treated as CC0.
    """
    if license_url == 'http://commons.wikimedia.org/wiki/GNU_Free_Documentation_License':
        return 'GNU Free Documentation License'
    if license_url == 'http://en.wikipedia.org/wiki/Public_domain':
        license_url = 'http://creativecommons.org/publicdomain/zero/1.0/'
    parsed = URL(license_url)
    if parsed.host() != 'creativecommons.org':
        return license_url
    segments = parsed.path().split('/')
    if len(segments) < 3:
        return license_url
    if segments[2] == 'zero':
        return 'Public Domain'
    return '(CC) %s' % segments[2].upper()
class Segments(object):
    """
    URL segment handler, not intended for direct use.

    The URL is constructed by joining base, path and segments.
    """

    def __init__(self, base, path, segments, defaults):
        # Preserve the base URL
        self.base = PURL(base, path=path)
        # Map the segments and defaults lists to an ordered dict, so that
        # segment order is stable when the URL is rebuilt.
        self.segments = OrderedDict(zip(segments, defaults))

    def build(self):
        # Join base segments and segments
        segments = self.base.path_segments() + tuple(self.segments.values())
        # Create a new URL with the segments replaced (purl URLs are
        # immutable; path_segments(...) returns a new URL).
        url = self.base.path_segments(segments)
        return url

    def full_path(self):
        # Strip scheme and host from the built URL, leaving only the path.
        full_path = self.build().as_string()
        full_path = full_path.replace(self.base.host(), '')
        full_path = full_path.replace(self.base.scheme(), '')
        # After removing scheme and host, ':///' (4 chars) remains at the
        # front — slice it off. NOTE(review): relies on host/scheme not
        # occurring elsewhere in the URL string — confirm.
        return full_path[4:]

    def __str__(self):
        return self.build().as_string()

    def _get_segment(self, segment):
        # Read a single named segment's current value.
        return self.segments[segment]

    def _set_segment(self, segment, value):
        # Overwrite a single named segment's value.
        self.segments[segment] = value

    @classmethod
    def _segment(cls, segment):
        """
        Returns a property capable of setting and getting a segment.
        """
        # The lambdas call the unbound accessors with the instance `x`
        # explicitly, closing over the segment name.
        return property(
            fget=lambda x: cls._get_segment(x, segment),
            fset=lambda x, v: cls._set_segment(x, segment, v),
        )
def main(args):
    """Load the Tsammalex dataset from its data repository into the database."""

    def data_file(*comps):
        # Resolve a path inside the data repository's tsammalexdata/data dir.
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")
    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))
    load_ecoregions(data_file, data)
    load_countries(data)
    # Collects row-id -> second-language ids while languoids are loaded;
    # resolved after all languoids exist.
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        # Enrich taxa with externally harvested data from taxa.json.
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    # Load each CSV-backed model; order matters because later models
    # reference earlier ones.
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)
    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        # Derive thumbnail/web variants from the original image URL.
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            # Only images hosted on the Edmond media server are attached.
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
def get_image_info(img):
    """Return provider info for the first img field a provider recognises.

    Checks the 'source', 'source_url' and 'id' fields in order against
    every known provider; returns ``None`` when nothing matches.
    """
    for field in ['source', 'source_url', 'id']:
        value = img[field]
        parsed = URL(value)
        host = parsed.host()
        segments = parsed.path_segments()
        for provider in PROVIDERS:
            if provider.id_from_url(parsed, host, segments):
                return provider.info(value)
def maybe_external_link(text, **kw):
    """Render *text* as an external link when it is an http(s) URL.

    Returns the text unchanged when it does not parse as a URL with a
    host and an http/https scheme.
    """
    parsed = URL(text)
    if not parsed.host():
        return text
    if parsed.scheme() not in ['http', 'https']:
        return text
    return external_link(text, **kw)
sys.exit(2) # Do some sanity checks on the config requiredAttribs = [ 'serviceName', 'package', 'components', 'configurations' ] for attrib in requiredAttribs: if not attrib in service_config: log.error("Invalid configuration. Missing required attribute '%s'", attrib) sys.exit(3) log.info('Installing service: %s on ambari host: %s', service_config['serviceName'], args.ambari_host) ambari_host_uri = URL(args.ambari_host) ambari_client = Ambari(ambari_host_uri.host(), port=ambari_host_uri.port(), protocol=ambari_host_uri.scheme(), username=args.username, password=args.password, identifier='hdiapps') # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses ambari_client.client.request_params['hooks'] = dict( response=shared_lib.Fixup(ambari_host_uri).fixup) # Assume we only have 1 cluster managed by this Ambari installation cluster = ambari_client.clusters.next() log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href) # Pull in any extra dynamic configuration if args.extra_config: try:
def main(args):
    """Load the Tsammalex dataset from its data repository into the database."""
    # On PostgreSQL, add a collation-key index so values sort per Unicode
    # collation rules.
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        # Resolve a path inside the data repository's tsammalexdata/data dir.
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")
    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))
    load_ecoregions(data_file, data)
    load_countries(data)
    # Collects row-id -> second-language ids while languoids are loaded;
    # resolved after all languoids exist.
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        # Enrich taxa with externally harvested data from taxa.json.
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    # Load each CSV-backed model; order matters because later models
    # reference earlier ones.
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)
    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        # Derive thumbnail/web variants from the original image URL.
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            # Only images hosted on the Edmond media server are attached.
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
    # NOTE(review): the matching `try:` for this handler lies before this chunk.
    service_config = config_request.json()
    log.debug('Service config: %s', service_config)
except:
    log.error("Invalid configuration URI", exc_info=True)
    sys.exit(2)

# Do some sanity checks on the config
requiredAttribs = ['serviceName', 'package', 'components', 'configurations']
for attrib in requiredAttribs:
    if not attrib in service_config:
        log.error("Invalid configuration. Missing required attribute '%s'", attrib)
        sys.exit(3)

log.info('Installing service: %s on ambari host: %s',
         service_config['serviceName'], args.ambari_host)

# Split the Ambari host argument into host/port/protocol for the client.
ambari_host_uri = URL(args.ambari_host)
ambari_client = Ambari(ambari_host_uri.host(),
                       port=ambari_host_uri.port(),
                       protocol=ambari_host_uri.scheme(),
                       username=args.username,
                       password=args.password,
                       identifier='hdiapps')

# If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
ambari_client.client.request_params['hooks'] = dict(response=shared_lib.Fixup(ambari_host_uri).fixup)

# Assume we only have 1 cluster managed by this Ambari installation
# NOTE(review): `.next()` is a Python 2 iterator call — confirm target runtime.
cluster = ambari_client.clusters.next()
log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

# Pull in any extra dynamic configuration
if args.extra_config:
    try:
        extra_config = json.loads(args.extra_config)
        log.debug('Applying dynamic service configuration specified on command-line: %s', extra_config)
    except:
        # Best-effort: bad extra config is logged and ignored.
        log.warning('Extra configuration specified by the -x argument could not be parsed as JSON. The value was \'%s\'. Details: ', args.extra_config, exc_info=True)
        extra_config = {}
else:
def wikipedia_url(s):  # pragma: no cover
    """Return *s* unchanged when it is an http(s) Wikipedia URL, else None.

    NOTE(review): assumes the URL has a host — confirm callers never pass
    host-less strings.
    """
    parsed = URL(s)
    if parsed.scheme() not in ['http', 'https']:
        return
    if 'wikipedia.' in parsed.host():
        return s
def url_parts(self, url):
    """Parse *url* and return the (URL object, host, path segments) triple."""
    parsed = URL(url)
    return parsed, parsed.host(), parsed.path_segments()
# NOTE(review): `str_url` is defined earlier in the file, outside this chunk.
print(str_url)
print(str_url.as_string())

# Build a URL from individual keyword components.
argument_url = URL(scheme='https', host='www.google.com', path='/search', query='q=google')
print(argument_url)
print(argument_url.as_string())

# Build the same URL fluently; each mutator returns a new URL instance.
inline_url = URL().scheme('https').domain('www.google.com').path(
    'search').query_param('q', 'google')
print(inline_url)
print(inline_url.as_string())

# Exercise the accessor methods on a fully specified URL.
u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
print(u.scheme())
print(u.host())
print(u.domain())
print(u.username())
print(u.password())
print(u.netloc())
print(u.port())
print(u.path())
print(u.query())
print(u.path_segments())
print(u.query_param('ssl'))
print(u.query_param('ssl', as_list=True))
print(u.query_params())
print(u.has_query_param('ssl'))
print(u.subdomains())

# Alternate constructor: parse from a plain string.
u = URL.from_string('https://github.com/minwook-shin')