Пример #1
0
 def __call__(self, data: str):
     try:
         url = URL(data)
     except ValueError:
         raise ValidationError(_("URL cannot be parsed"),
                               code="parse_error")
     if url.has_query_param('db'):
         if not url.query_param('db').isdigit():
             raise ValidationError(_("Invalid port specified"),
                                   code="invalid_port")
     if url.scheme() == "unix":
         if url.host():
             raise ValidationError(
                 _("Hostname not supported for unix domain sockets"),
                 code="unix_domain_socket_hostname")
         if url.port():
             raise ValidationError(
                 _("Port not supported for unix domain sockets"),
                 code="unix_domain_socket_port")
         if not url.path():
             raise ValidationError(
                 _("No path specified for unix domain socket"),
                 code="unix_domain_socket_path")
     if url.scheme() in ("redis", "redis+tls"):
         if not url.host():
             raise ValidationError(_("No host specified"),
                                   code="host_missing")
Пример #2
0
 def _collect_external_links(self, response):
     """Сборщик внешних ссылок
     """
     for link in response.html.absolute_links:
         url = URL(link)
         if self.domain not in url.host():
             self.external_links.add(url.host())
Пример #3
0
 def url(self, path='/', **query):
     url = URL(path)
     if not url.host():
         url = url.host(self.host)
     if not url.scheme():
         url = url.scheme('http')
     for k, v in query.items():
         url = url.query_param(k, v)
     return url
Пример #4
0
    def __init__(self, url, save_dir='tmp'):
        """
        @url: full url of a site
        @save_dir: dir to save site
        """
        # log
        self.logger = logger('file', 'sitelog.log', save_dir)
        self.logger.info('-' * 20)
        self.logger.info('start')
        self.logger.info('start func: __init__')
        self.logger.info('url: %s' % url)

        save_time = datetime.strftime(datetime.now(), '%Y%m%d%H%M')
        self.save_time = save_time
        self.save_dir = os.path.abspath(os.path.join(save_dir, save_time))
        # create dir if not exist
        if not os.path.isdir(self.save_dir):
            os.makedirs(self.save_dir)

        self.url = url
        u = URL(url)
        # get host like: http://m.sohu.xom
        self.host = u.scheme() + '://' + u.host()
        print '%s: saving %s' % (save_time, self.url)
        self.logger.info('end func: __init__')
Пример #5
0
def cc_link(req, license_url, button="regular"):
    if license_url == "http://en.wikipedia.org/wiki/Public_domain":
        license_url = "http://creativecommons.org/publicdomain/zero/1.0/"
    license_url = URL(license_url)
    if license_url.host() != "creativecommons.org":
        return

    comps = license_url.path().split("/")
    if len(comps) < 3:
        return  # pragma: no cover

    known = {
        "zero": "Public Domain",
        "by": "Creative Commons Attribution License",
        "by-nc": "Creative Commons Attribution-NonCommercial License",
        "by-nc-nd": "Creative Commons Attribution-NonCommercial-NoDerivatives License",
        "by-nc-sa": "Creative Commons Attribution-NonCommercial-ShareAlike License",
        "by-nd": "Creative Commons Attribution-NoDerivatives License",
        "by-sa": "Creative Commons Attribution-ShareAlike License",
    }
    if comps[2] not in known:
        return

    icon = "cc-" + comps[2] + ("-small" if button == "small" else "") + ".png"
    img_attrs = dict(alt=known[comps[2]], src=req.static_url("clld:web/static/images/" + icon))
    height, width = (15, 80) if button == "small" else (30, 86)
    img_attrs.update(height=height, width=width)
    return HTML.a(HTML.img(**img_attrs), href=license_url, rel="license")
Пример #6
0
def cc_link(req, license_url, button='regular'):
    if license_url == 'https://en.wikipedia.org/wiki/Public_domain':
        license_url = 'https://creativecommons.org/publicdomain/zero/1.0/'
    license_url = URL(license_url)
    if license_url.host() != 'creativecommons.org':
        return

    comps = license_url.path().split('/')
    if len(comps) < 3:
        return  # pragma: no cover

    known = {
        'zero': 'Public Domain',
        'by': 'Creative Commons Attribution License',
        'by-nc': 'Creative Commons Attribution-NonCommercial License',
        'by-nc-nd': 'Creative Commons Attribution-NonCommercial-NoDerivatives License',
        'by-nc-sa': 'Creative Commons Attribution-NonCommercial-ShareAlike License',
        'by-nd': 'Creative Commons Attribution-NoDerivatives License',
        'by-sa': 'Creative Commons Attribution-ShareAlike License'}
    if comps[2] not in known:
        return

    icon = 'cc-' + comps[2] + ('-small' if button == 'small' else '') + '.png'
    img_attrs = dict(
        alt=known[comps[2]],
        src=req.static_url('clld:web/static/images/' + icon))
    height, width = (15, 80) if button == 'small' else (30, 86)
    img_attrs.update(height=height, width=width)
    return HTML.a(HTML.img(**img_attrs), href=license_url, rel='license')
Пример #7
0
def get_bib(args):
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    fname_to_cdstar = {}
    for type_ in ['texts', 'docs', 'data']:
        for hash_, paths in load(args.data_file('repos', type_ + '.json')).items():
            if hash_ in uploaded:
                for path in paths:
                    fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    for hash_, paths in load(args.data_file('repos', 'edmond.json')).items():
        if hash_ in uploaded:
            for path in paths:
                fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    db = Database.from_file(args.data_file('repos', 'Dogon.bib'), lowercase=True)
    for rec in db:
        doc = Document(rec)
        newurls = []
        for url in rec.get('url', '').split(';'):
            if not url.strip():
                continue
            if url.endswith('sequence=1'):
                newurls.append(url)
                continue
            url = URL(url.strip())
            if url.host() in ['dogonlanguages.org', 'github.com', '']:
                fname = url.path().split('/')[-1]
                doc.files.append((fname, fname_to_cdstar[fname]))
            else:
                newurls.append(url.as_string())
        doc.rec['url'] = '; '.join(newurls)
        yield doc
Пример #8
0
def cc_link(req, license_url, button='regular'):
    if license_url == 'https://en.wikipedia.org/wiki/Public_domain':
        license_url = 'https://creativecommons.org/publicdomain/zero/1.0/'  # pragma: no cover
    license_url = URL(license_url)
    if license_url.host() != 'creativecommons.org':
        return

    comps = license_url.path().split('/')
    if len(comps) < 3:
        return  # pragma: no cover

    known = {
        'zero': 'Public Domain',
        'by': 'Creative Commons Attribution License',
        'by-nc': 'Creative Commons Attribution-NonCommercial License',
        'by-nc-nd': 'Creative Commons Attribution-NonCommercial-NoDerivatives License',
        'by-nc-sa': 'Creative Commons Attribution-NonCommercial-ShareAlike License',
        'by-nd': 'Creative Commons Attribution-NoDerivatives License',
        'by-sa': 'Creative Commons Attribution-ShareAlike License'}
    if comps[2] not in known:
        return

    icon = 'cc-' + comps[2] + ('-small' if button == 'small' else '') + '.png'
    img_attrs = dict(
        alt=known[comps[2]],
        src=req.static_url('clld:web/static/images/' + icon))
    height, width = (15, 80) if button == 'small' else (30, 86)
    img_attrs.update(height=height, width=width)
    return HTML.a(HTML.img(**img_attrs), href=license_url, rel='license')
Пример #9
0
    def info(self, url):
        """Interface method to be called when processing new images.

        This method ties together the DataProvider workflow.
        """
        url = URL(url)
        return self.postprocess(
            self.info_for_id(self.id_from_url(url, url.host(), url.path_segments())))
Пример #10
0
def maybe_license_link(req, license, **kw):
    cc_link_ = cc_link(req, license, button=kw.pop('button', 'regular'))
    if cc_link_:
        return cc_link_
    license_url = URL(license)
    if license_url.host():
        return external_link(license_url, **kw)
    return license
Пример #11
0
def maybe_license_link(req, license, **kw):
    cc_link_ = cc_link(req, license, button=kw.pop('button', 'regular'))
    if cc_link_:
        return cc_link_
    license_url = URL(license)
    if license_url.host():
        return external_link(license_url, **kw)
    return license
Пример #12
0
Файл: cli.py Проект: sesh/downyt
    def get_video_id_from_url(self, video_url):
        video_url = URL(video_url)

        if 'youtube' not in video_url.host():
            raise DownytError(
                'Provided URL is not from YouTube: {}'.format(video_url)
            )

        return video_url.query_param('v')
Пример #13
0
def blog_feed(request):
    """
    Proxy feeds from the blog, so they can be accessed via XHR requests.

    We also convert RSS to ATOM so that clld's javascript Feed component can read them.
    """
    if not request.params.get('path'):
        raise HTTPNotFound()
    path = URL(request.params['path'])
    assert not path.host()
    try:
        return atom_feed(request, request.blog.url(path.as_string()))
    except ConnectionError:  # pragma: no cover
        raise HTTPNotFound()
Пример #14
0
def blog_feed(request):
    """
    Proxy feeds from the blog, so they can be accessed via XHR requests.

    We also convert RSS to ATOM so that clld's javascript Feed component can read them.
    """
    if not request.params.get('path'):
        raise HTTPNotFound()
    path = URL(request.params['path'])
    assert not path.host()
    try:
        return atom_feed(request, request.blog.url(path.as_string()))
    except ConnectionError:
        raise HTTPNotFound()
Пример #15
0
def license_name(license_url):
    if license_url == "http://commons.wikimedia.org/wiki/GNU_Free_Documentation_License":
        return 'GNU Free Documentation License'
    if license_url == 'http://en.wikipedia.org/wiki/Public_domain':
        license_url = 'http://creativecommons.org/publicdomain/zero/1.0/'
    license_url_ = URL(license_url)
    if license_url_.host() != 'creativecommons.org':
        return license_url

    comps = license_url_.path().split('/')
    if len(comps) < 3:
        return license_url

    return {
        'zero': 'Public Domain',
    }.get(comps[2], '(CC) %s' % comps[2].upper())
Пример #16
0
def license_name(license_url):
    if license_url == "http://commons.wikimedia.org/wiki/GNU_Free_Documentation_License":
        return 'GNU Free Documentation License'
    if license_url == 'http://en.wikipedia.org/wiki/Public_domain':
        license_url = 'http://creativecommons.org/publicdomain/zero/1.0/'
    license_url_ = URL(license_url)
    if license_url_.host() != 'creativecommons.org':
        return license_url

    comps = license_url_.path().split('/')
    if len(comps) < 3:
        return license_url

    return {
        'zero': 'Public Domain',
    }.get(comps[2], '(CC) %s' % comps[2].upper())
Пример #17
0
class Segments(object):
    """
    URL segment handler, not intended for direct use. The URL is constructed by
    joining base, path and segments.
    """
    def __init__(self, base, path, segments, defaults):
        # Preserve the base URL
        self.base = PURL(base, path=path)
        # Map the segments and defaults lists to an ordered dict
        self.segments = OrderedDict(zip(segments, defaults))

    def build(self):
        # Join base segments and segments
        segments = self.base.path_segments() + tuple(self.segments.values())

        # Create a new URL with the segments replaced
        url = self.base.path_segments(segments)
        return url

    def full_path(self):
        full_path = self.build().as_string()
        full_path = full_path.replace(self.base.host(), '')
        full_path = full_path.replace(self.base.scheme(), '')
        return full_path[4:]

    def __str__(self):
        return self.build().as_string()

    def _get_segment(self, segment):
        return self.segments[segment]

    def _set_segment(self, segment, value):
        self.segments[segment] = value

    @classmethod
    def _segment(cls, segment):
        """
        Returns a property capable of setting and getting a segment.
        """
        return property(
            fget=lambda x: cls._get_segment(x, segment),
            fset=lambda x, v: cls._set_segment(x, segment, v),
        )
Пример #18
0
def main(args):
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution,
             'tsammalex',
             name="Tsammalex",
             id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec,
                 rec.id,
                 _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(data,
                           lang,
                           lang.id.split('-')[0],
                           None,
                           glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg',
                      source_url).replace('/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(url=image.source_url,
                            thumbnail=image_url(image.source_url, 'thumbnail'),
                            web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(object=data['Taxon'][image.taxa__id],
                                       id=image.id,
                                       name=image.tags,
                                       jsondata=jsondata,
                                       mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Пример #19
0
def get_image_info(img):
    for field in ['source', 'source_url', 'id']:
        for provider in PROVIDERS:
            url = URL(img[field])
            if provider.id_from_url(url, url.host(), url.path_segments()):
                return provider.info(img[field])
Пример #20
0
def maybe_external_link(text, **kw):
    url = URL(text)
    if url.host() and url.scheme() in ['http', 'https']:
        return external_link(text, **kw)
    return text
Пример #21
0
        sys.exit(2)

    # Do some sanity checks on the config
    requiredAttribs = [
        'serviceName', 'package', 'components', 'configurations'
    ]
    for attrib in requiredAttribs:
        if not attrib in service_config:
            log.error("Invalid configuration. Missing required attribute '%s'",
                      attrib)
            sys.exit(3)

    log.info('Installing service: %s on ambari host: %s',
             service_config['serviceName'], args.ambari_host)
    ambari_host_uri = URL(args.ambari_host)
    ambari_client = Ambari(ambari_host_uri.host(),
                           port=ambari_host_uri.port(),
                           protocol=ambari_host_uri.scheme(),
                           username=args.username,
                           password=args.password,
                           identifier='hdiapps')
    # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
    ambari_client.client.request_params['hooks'] = dict(
        response=shared_lib.Fixup(ambari_host_uri).fixup)
    # Assume we only have 1 cluster managed by this Ambari installation
    cluster = ambari_client.clusters.next()
    log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

    # Pull in any extra dynamic configuration
    if args.extra_config:
        try:
Пример #22
0
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
        service_config = config_request.json()
        log.debug('Service config: %s', service_config)
    except:
        log.error("Invalid configuration URI", exc_info=True)
        sys.exit(2)

    # Do some sanity checks on the config
    requiredAttribs = ['serviceName', 'package', 'components', 'configurations']
    for attrib in requiredAttribs:
        if not attrib in service_config:
            log.error("Invalid configuration. Missing required attribute '%s'", attrib)
            sys.exit(3)

    log.info('Installing service: %s on ambari host: %s', service_config['serviceName'], args.ambari_host)
    ambari_host_uri = URL(args.ambari_host)
    ambari_client = Ambari(ambari_host_uri.host(), port=ambari_host_uri.port(), protocol=ambari_host_uri.scheme(), username=args.username, password=args.password, identifier='hdiapps')
    # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
    ambari_client.client.request_params['hooks'] = dict(response=shared_lib.Fixup(ambari_host_uri).fixup)
    # Assume we only have 1 cluster managed by this Ambari installation 
    cluster = ambari_client.clusters.next()
    log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

    # Pull in any extra dynamic configuration
    if args.extra_config:
        try:
            extra_config = json.loads(args.extra_config)
            log.debug('Applying dynamic service configuration specified on command-line: %s', extra_config)
        except:
            log.warning('Extra configuration specified by the -x argument could not be parsed as JSON. The value was \'%s\'. Details: ', args.extra_config, exc_info=True)
            extra_config = {}    
    else:
Пример #24
0
def wikipedia_url(s):  # pragma: no cover
    url = URL(s)
    if url.scheme() in ['http', 'https'] and 'wikipedia.' in url.host():
        return s
Пример #25
0
 def url_parts(self, url):
     url = URL(url)
     return url, url.host(), url.path_segments()
Пример #26
0
print(str_url)
print(str_url.as_string())
argument_url = URL(scheme='https',
                   host='www.google.com',
                   path='/search',
                   query='q=google')
print(argument_url)
print(argument_url.as_string())
inline_url = URL().scheme('https').domain('www.google.com').path(
    'search').query_param('q', 'google')
print(inline_url)
print(inline_url.as_string())

u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
print(u.scheme())
print(u.host())
print(u.domain())
print(u.username())
print(u.password())
print(u.netloc())
print(u.port())
print(u.path())
print(u.query())
print(u.path_segments())
print(u.query_param('ssl'))
print(u.query_param('ssl', as_list=True))
print(u.query_params())
print(u.has_query_param('ssl'))
print(u.subdomains())

u = URL.from_string('https://github.com/minwook-shin')
Пример #27
0
def maybe_external_link(text, **kw):
    url = URL(text)
    if url.host() and url.scheme() in ['http', 'https']:
        return external_link(text, **kw)
    return text
Пример #28
0
def wikipedia_url(s):  # pragma: no cover
    url = URL(s)
    if url.scheme() in ['http', 'https'] and 'wikipedia.' in url.host():
        return s