def resolve(base, reference, strict=True):
    """ Resolve a reference URI against a base URI to form a target URI.
    Implements relative URI resolution according to RFC 3986 section 5.2,
    "Relative Resolution". Note that urllib's urljoin does not work with
    non-standard schemes (like our pydata: scheme), hence this
    implementation.
    """
    if not is_uri(base):
        raise UriSyntaxError("base was not a valid URI: {0}".format(base))
    if not is_uri_reference(reference):
        raise UriSyntaxError(
            "reference was not a valid URI-reference: {0}".format(reference))
    b, ref = urlsplit(base), urlsplit(reference)
    scheme, authority, path, query, fragment = None, None, None, None, None
    if not strict and ref.scheme == b.scheme:
        ref = SplitResult("", *ref[1:])
    if ref.scheme:
        scheme = ref.scheme
        authority = ref.netloc
        path = _remove_dot_segments(ref.path)
        query = ref.query
    else:
        if ref.netloc:
            authority = ref.netloc
            path = _remove_dot_segments(ref.path)
            query = ref.query
        else:
            if ref.path == "":
                path = b.path
                if ref.query:
                    query = ref.query
                else:
                    query = b.query
            else:
                if ref.path.startswith("/"):
                    path = _remove_dot_segments(ref.path)
                else:
                    path = _remove_dot_segments(_merge(b, ref.path))
                query = ref.query
            authority = b.netloc
        scheme = b.scheme
    fragment = ref.fragment
    return recombine(SplitResult(scheme, authority, path, query, fragment))
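
# A minimal usage sketch (assumes the helpers used above -- is_uri,
# is_uri_reference, _remove_dot_segments, _merge and recombine -- are
# defined in this module). The first two expected results are normative
# examples from RFC 3986 section 5.4.1; the pydata: one is illustrative:
#
#     resolve("http://a/b/c/d;p?q", "g")    -> "http://a/b/c/g"
#     resolve("http://a/b/c/d;p?q", "../g") -> "http://a/b/g"
#     resolve("pydata://host/a/b", "c?x=1") -> "pydata://host/a/c?x=1"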
def oembed(url):
    """ Endpoint to support oEmbed (see https://oembed.com/).
    Example request::

        https://hasjob.co/api/1/oembed?url=https://hasjob.co/

    Required for services like embed.ly, which need a registered oEmbed API
    handler.
    """
    endpoint, view_args = endpoint_for(url)
    if endpoint not in embed_index_views:
        return jsonify({})
    board = Board.get(view_args.get('subdomain', 'www'))
    iframeid = 'hasjob-iframe-' + str(uuid4())
    parsed_url = urlsplit(url)
    embed_url = SplitResult(
        parsed_url.scheme,
        parsed_url.netloc,
        parsed_url.path,
        parsed_url.query + ('&' if parsed_url.query else '')
        + 'embed=1&iframeid=' + iframeid,
        parsed_url.fragment,
    ).geturl()
    return jsonify({
        'provider_url': url_for('index', subdomain=None, _external=True),
        'provider_name': app.config['SITE_TITLE'],
        'thumbnail_width': 200,
        'thumbnail_height': 200,
        'thumbnail_url': url_for(
            'static', filename='img/hasjob-logo-200x200.png', _external=True),
        'author_name': board.title if board else app.config['SITE_TITLE'],
        'author_url': board.url_for(_external=True)
        if board else url_for('index', subdomain=None, _external=True),
        'title': ' | '.join([board.title, board.caption])
        if board else app.config['SITE_TITLE'],
        'html': ('<iframe id="{iframeid}" src="{url}" '
                 'width="100%" height="724" frameborder="0" '
                 'scrolling="no">'.format(url=embed_url, iframeid=iframeid)),
        'version': '1.0',
        'type': 'rich',
    })
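
# A self-contained sketch of the query-append pattern used to build
# embed_url above, standard library only (names are illustrative):
from urllib.parse import SplitResult, urlsplit

def append_query(url, extra):
    """Append 'extra' (an already-encoded query string) to url's query."""
    p = urlsplit(url)
    joined = p.query + ('&' if p.query else '') + extra
    return SplitResult(p.scheme, p.netloc, p.path, joined, p.fragment).geturl()

# append_query('https://hasjob.co/?a=1', 'embed=1')
#     -> 'https://hasjob.co/?a=1&embed=1'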
def create_url(request,
               swap_scheme=False,
               swap_origin=False,
               downgrade=False,
               query_parameter_to_remove=u"redirection"):
    parsed = urlsplit(request.url)
    destination_netloc = parsed.netloc
    scheme = parsed.scheme

    if swap_scheme:
        scheme = u"http" if parsed.scheme == u"https" else u"https"
        hostname = parsed.netloc.split(u':')[0]
        port = request.server.config[u"ports"][scheme][0]
        destination_netloc = u":".join([hostname, str(port)])

    if downgrade:
        # These rely on some unintuitive cleverness due to WPT's test setup:
        # 'Upgrade-Insecure-Requests' does not upgrade the port number,
        # so we use URLs in the form `http://[domain]:[https-port]`,
        # which will be upgraded to `https://[domain]:[https-port]`.
        # If the upgrade fails, the load will fail, as we don't serve HTTP
        # over the secure port.
        if parsed.scheme == u"https":
            scheme = u"http"
        elif parsed.scheme == u"wss":
            scheme = u"ws"
        else:
            raise ValueError(u"Downgrade redirection: Invalid scheme '%s'" %
                             parsed.scheme)
        hostname = parsed.netloc.split(u':')[0]
        port = request.server.config[u"ports"][parsed.scheme][0]
        destination_netloc = u":".join([hostname, str(port)])

    if swap_origin:
        destination_netloc = __get_swapped_origin_netloc(destination_netloc)

    parsed_query = parse_qsl(parsed.query, keep_blank_values=True)
    parsed_query = [
        x for x in parsed_query if x[0] != query_parameter_to_remove
    ]

    destination_url = urlunsplit(SplitResult(scheme=scheme,
                                             netloc=destination_netloc,
                                             path=parsed.path,
                                             query=urlencode(parsed_query),
                                             fragment=None))
    return destination_url
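
# A minimal, standalone sketch of the parameter-removal step above,
# standard library only (the helper name is illustrative):
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

def drop_param(url, name):
    """Return 'url' with every query parameter called 'name' removed."""
    p = urlsplit(url)
    kept = [kv for kv in parse_qsl(p.query, keep_blank_values=True)
            if kv[0] != name]
    return urlunsplit(p._replace(query=urlencode(kept)))

# drop_param('http://h/p?redirection=1&x=2', 'redirection') -> 'http://h/p?x=2'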
def get_plane_uri(cls, observation_uri, product_id):
    """
    Initializes a Plane URI instance

    Arguments:
    observation_uri : the URI of the observation
    product_id : ID of the product
    """
    caom_util.type_check(observation_uri, ObservationURI, "observation_uri",
                         override=False)
    caom_util.type_check(product_id, str, "product_id", override=False)
    caom_util.validate_path_component(cls, "product_id", product_id)

    path = urlsplit(observation_uri.uri).path
    uri = SplitResult(ObservationURI._SCHEME, "", path + "/" + product_id,
                      "", "").geturl()
    return cls(uri)
def get_observation_uri(cls, collection, observation_id):
    """
    Initializes an Observation URI instance

    Arguments:
    collection : collection
    observation_id : ID of the observation
    """
    caom_util.type_check(collection, str, "collection", override=False)
    caom_util.type_check(observation_id, str, "observation_id",
                         override=False)
    caom_util.validate_path_component(cls, "collection", collection)
    caom_util.validate_path_component(cls, "observation_id", observation_id)

    uri = SplitResult(ObservationURI._SCHEME, "",
                      collection + "/" + observation_id,
                      "", "").geturl()
    return cls(uri)
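
# A small standalone check of the SplitResult(...).geturl() pattern used by
# both factory methods above: with an empty netloc and a relative path,
# geturl() yields a compact "scheme:path" URI. The "caom" scheme and IDs
# below are illustrative, standing in for ObservationURI._SCHEME:
from urllib.parse import SplitResult

assert SplitResult("caom", "", "HST/obs123", "", "").geturl() == "caom:HST/obs123"
assert SplitResult("caom", "", "HST/obs123/prodA", "", "").geturl() == "caom:HST/obs123/prodA"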
from urllib.parse import SplitResult, urlsplit, urlunsplit

def ensure_scheme(url, default_scheme='http'):
    """Adds a scheme to a url if not present.

    Args:
        url (string): a url, assumed to start with netloc
        default_scheme (string): a scheme to be added

    Returns:
        string: URL with a scheme
    """
    parsed = urlsplit(url, scheme=default_scheme)
    if not parsed.netloc:
        # Without '//', urlsplit puts 'example.com/...' in path and leaves
        # netloc empty; move it over so urlunsplit emits 'scheme://...'.
        parsed = SplitResult(scheme=parsed.scheme,
                             netloc=parsed.path,
                             path='',
                             query=parsed.query,
                             fragment=parsed.fragment)
    return urlunsplit(parsed)
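
# Usage sketch (standard library only; values illustrative):
#
#     ensure_scheme('example.com/a?b=1')     -> 'http://example.com/a?b=1'
#     ensure_scheme('example.com', 'https')  -> 'https://example.com'
#     ensure_scheme('https://example.com/a') -> 'https://example.com/a'  (unchanged)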
def preprocess_url(referrer, url):
    ''' Clean and filter URLs before scraping.

    'domain' and 'constrain' are module-level settings.
    '''
    if not url:
        return None
    # convert to an absolute URL and split into components
    fields = urlsplit(urljoin(referrer, url))._asdict()
    fields['path'] = re.sub(r'/$', '', fields['path'])  # remove trailing /
    fields['fragment'] = ''  # remove targets within a page
    fields = SplitResult(**fields)

    # 'replytocom' appears in reply-to-comment links on WordPress sites;
    # skip those and other comment URLs, and scrape pages of the current
    # domain only.
    if fields.netloc == domain and "replytocom" not in url and "comment" not in url:
        if fields.scheme == 'http':
            httpurl = cleanurl = fields.geturl()
            httpsurl = httpurl.replace('http:', 'https:', 1)
        else:
            httpsurl = cleanurl = fields.geturl()
            httpurl = httpsurl.replace('https:', 'http:', 1)
        if constrain in httpsurl or constrain in httpurl:
            return cleanurl
    return None
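
# The _asdict()/SplitResult(**...) round-trip above is just a namedtuple
# edit; an equivalent, slightly more direct form uses _replace (values
# illustrative, and rstrip matches re.sub(r'/$', '') for a single
# trailing slash):
from urllib.parse import urlsplit

parts = urlsplit('https://example.com/post/#comments')
cleaned = parts._replace(path=parts.path.rstrip('/'), fragment='')
# cleaned.geturl() -> 'https://example.com/post'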
def handle_distrib(self, url):
    """React to a file dispatch message."""
    url = urlsplit(url)
    dummy, filename = os.path.split(url.path)
    LOGGER.debug("filename = %s", filename)
    # TODO: Should not make any assumptions on filename formats, should
    # load a description of it from a config file instead.
    if filename.endswith(".hmf"):
        risestr, satellite = filename[:-4].split("_", 1)
        risetime = datetime.strptime(risestr, "%Y%m%d%H%M%S")
        pname = pass_name(risetime, satellite)
        satellite = satellite.replace("_", "-")
        if satellite in self._excluded_platforms:
            return None
        swath = self._received_passes.get(pname, {}).copy()
        swath.pop('satellite', None)
        swath["platform_name"] = satellite
        swath["start_time"] = risetime
        swath["type"] = "binary"
        swath["format"] = "HRPT"
        if satellite == "NOAA-15":
            swath["sensor"] = ("avhrr/3", "amsu-a", "amsu-b", "hirs/3")
        elif satellite in ["NOAA-18", "NOAA-19"]:
            swath["sensor"] = ("avhrr/3", "mhs", "amsu-a", "hirs/4")
        swath["data_processing_level"] = "0"

    elif filename.startswith("P042") or filename.startswith("P154"):
        pds = {}
        pds["format"] = filename[0]
        pds["apid1"] = filename[1:8]
        pds["apid2"] = filename[8:15]
        pds["apid3"] = filename[15:22]
        pds["time"] = datetime.strptime(filename[22:33], "%y%j%H%M%S")
        pds["nid"] = filename[33]
        pds["ufn"] = filename[34:36]
        pds["extension"] = filename[36:40]
        risetime = pds["time"]
        if pds["apid1"][:3] == "042":
            satellite = "EOS-Terra"
            pname = pass_name(risetime, 'TERRA')
        elif pds["apid1"][:3] == "154":
            satellite = "EOS-Aqua"
            pname = pass_name(risetime, 'AQUA')
        else:
            raise ValueError("Unrecognized satellite ID: " + pds["apid1"][:3])
        if not satellite or satellite in self._excluded_platforms:
            LOGGER.debug("Platform name %s is excluded...", str(satellite))
            return None
        swath = self._received_passes.get(pname, {}).copy()
        swath.pop('satellite', None)
        swath['platform_name'] = satellite
        swath['start_time'] = risetime
        instruments = {"0064": "modis",
                       "0141": "ceres+y",
                       "0157": "ceres-y",
                       "0261": "amsu-a1",
                       "0262": "amsu-a1",
                       "0290": "amsu-a2",
                       "0342": "hsb",
                       "0402": "amsr-e",
                       "0404": "airs",
                       "0405": "airs",
                       "0406": "airs",
                       "0407": "airs",
                       "0414": "airs",
                       "0415": "airs",
                       "0419": "airs",
                       "0957": "gbad"}
        swath["sensor"] = instruments.get(pds["apid1"][3:], pds["apid1"][3:])
        swath["format"] = "PDS"
        swath["type"] = "binary"
        swath["data_processing_level"] = "0"

    # NPP/JPSS RDRs
    elif filename.startswith("R") and filename.endswith(".h5"):
        # Occasionally RT-STPS produces files with a nonstandard file
        # naming, lacking the 'RNSCA' field. We will try to deal with this
        # below (Adam - 2013-06-04):
        mda = {}
        mda["format"] = filename[0]
        file_ok = False
        for prefix in JPSS_INSTRUMENTS_FROM_FILENAMES:
            if filename.startswith(prefix):
                mda["sensor"] = JPSS_INSTRUMENTS_FROM_FILENAMES[prefix]
                # str.strip(prefix) removes *characters*, not a prefix, and
                # can over-strip; slice the prefix off instead.
                rest = filename[len(prefix):].split('_')
                start_time_items = rest[1:3]
                end_time_item = rest[3]
                satellite = JPSS_PLATFORM_NAME.get(rest[0], None)
                orbit = rest[4].strip('b')
                file_ok = True
                break

        if not file_ok:
            LOGGER.warning("Seems to be a NPP/JPSS RDR "
                           "file but name is not standard!")
            LOGGER.warning("filename = %s", filename)
            return None

        # satellite = "Suomi-NPP, NOAA-20, NOAA-21,..."
        if not satellite or satellite in self._excluded_platforms:
            LOGGER.debug("Platform name %s is excluded...", str(satellite))
            return None

        mda["start_time"] = \
            datetime.strptime(start_time_items[0] + start_time_items[1],
                              "d%Y%m%dt%H%M%S%f")
        end_time = \
            datetime.strptime(start_time_items[0] + end_time_item,
                              "d%Y%m%de%H%M%S%f")
        if mda["start_time"] > end_time:
            end_time += timedelta(days=1)
        mda["orbit"] = orbit

        # FIXME: swath start and end time is granule dependent.
        # Get the end time as well! - Adam 2013-06-03:
        start_time = mda["start_time"]
        pname = pass_name(start_time, SCISYS_NAMES.get(satellite))
        swath = self._received_passes.get(pname, {}).copy()
        swath.pop("satellite", None)
        swath["platform_name"] = satellite
        swath["start_time"] = start_time
        swath['end_time'] = end_time
        swath["sensor"] = mda["sensor"]
        swath["format"] = "RDR"
        swath["type"] = "HDF5"
        swath["data_processing_level"] = "0"

    # metop
    elif filename[4:12] == "_HRP_00_":
        # "AVHR": "avhrr",
        instruments = {"ASCA": "ascat",
                       "AMSA": "amsu-a",
                       "ATOV": "atovs",
                       "AVHR": "avhrr/3",
                       "GOME": "gome",
                       "GRAS": "gras",
                       "HIRS": "hirs/4",
                       "IASI": "iasi",
                       "MHSx": "mhs",
                       "SEMx": "sem",
                       "ADCS": "adcs",
                       "SBUV": "sbuv",
                       "HKTM": "vcdu34"}

        satellites = {"M02": "Metop-A", "M01": "Metop-B", "M03": "Metop-C"}
        satellite = satellites[filename[12:15]]
        risetime = datetime.strptime(filename[16:31], "%Y%m%d%H%M%SZ")
        falltime = datetime.strptime(filename[32:47], "%Y%m%d%H%M%SZ")
        pname = pass_name(risetime, satellite.upper())
        LOGGER.debug("pname= %s", str(pname))
        swath = self._received_passes.get(pname, {}).copy()
        swath.pop('satellite', None)
        swath["start_time"] = risetime
        swath["end_time"] = falltime
        swath["platform_name"] = satellite
        swath["sensor"] = instruments[filename[:4]]
        swath["format"] = "EPS"
        swath["type"] = "binary"
        swath["data_processing_level"] = "0"

    else:
        return None

    if url.scheme in ["", "file"]:
        scheme = "ssh"
        netloc = self._emitter
        uri = urlunsplit(SplitResult(scheme, netloc, url.path,
                                     url.query, url.fragment))
    elif url.scheme == "ftp":
        scheme = "ssh"
        netloc = url.hostname
        uri = urlunsplit(SplitResult(scheme, netloc, url.path,
                                     url.query, url.fragment))
    else:
        LOGGER.debug("url.scheme not expected: %s", url.scheme)
        # 'uri' would otherwise be unbound below; bail out explicitly.
        return None

    swath["uid"] = os.path.split(url.path)[1]
    swath["uri"] = uri
    swath['variant'] = 'DR'
    return swath
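
# A self-contained sketch of the URI rebuild at the end of handle_distrib:
# splitting a URL and reassembling it under another scheme with SplitResult
# (the hostname and path below are illustrative):
from urllib.parse import SplitResult, urlsplit, urlunsplit

url = urlsplit("ftp://receiver.example.org/data/pass.hmf")
rebuilt = urlunsplit(SplitResult("ssh", url.hostname, url.path,
                                 url.query, url.fragment))
# rebuilt == "ssh://receiver.example.org/data/pass.hmf"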
if external_host_env is None:
    # default, via the "zulipdev.com" hostname.
    EXTERNAL_HOST = 'zulipdev.com:9991'
    # Serve the main dev realm at the literal name "localhost",
    # so it works out of the box even when not on the Internet.
    REALM_HOSTS = {
        'zulip': 'localhost:9991',
    }
else:
    EXTERNAL_HOST = external_host_env
    REALM_HOSTS = {
        'zulip': EXTERNAL_HOST,
    }

# TODO: Replace with scripts.lib.zulip_tools.deport when this no longer needs
# to be Python 2 compatible for zthumbor.
r = SplitResult("", EXTERNAL_HOST, "", "", "")
assert r.hostname is not None
EXTERNAL_HOST_WITHOUT_PORT = "[" + r.hostname + "]" if ":" in r.hostname else r.hostname

ALLOWED_HOSTS = ['*']

# Uncomment extra backends if you want to test with them. Note that
# for Google and GitHub auth you'll need to do some pre-setup.
AUTHENTICATION_BACKENDS = (
    'zproject.backends.DevAuthBackend',
    'zproject.backends.EmailAuthBackend',
    'zproject.backends.GitHubAuthBackend',
    'zproject.backends.GoogleAuthBackend',
    'zproject.backends.SAMLAuthBackend',
    # 'zproject.backends.AzureADAuthBackend',
    'zproject.backends.GitLabAuthBackend',
)
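
# A standalone sketch of the EXTERNAL_HOST_WITHOUT_PORT computation above:
# parsing a netloc through SplitResult yields a 'hostname' with brackets and
# port stripped, so IPv6 literals must be re-bracketed (values illustrative):
from urllib.parse import SplitResult

for host in ["zulipdev.com:9991", "[::1]:9991"]:
    r = SplitResult("", host, "", "", "")
    without_port = "[" + r.hostname + "]" if ":" in r.hostname else r.hostname
    # "zulipdev.com:9991" -> "zulipdev.com"; "[::1]:9991" -> "[::1]"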