def _get_remote_projects(self):
    headers = {"Accept": "text/html"}
    # use a minimum of 30 seconds as timeout for remote server and
    # 60s when running as replica, because the list can be quite large
    # and the master might take a while to process it
    if self.xom.is_replica():
        timeout = max(self.timeout, 60)
    else:
        timeout = max(self.timeout, 30)
    response = self.httpget(
        self.mirror_url, allow_redirects=True,
        extra_headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise self.UpstreamError(
            "URL %r returned %s %s",
            self.mirror_url, response.status_code, response.reason)
    page = HTMLPage(response.text, response.url)
    projects = set()
    baseurl = URL(response.url)
    basehost = baseurl.replace(path='')
    for link in page.links:
        newurl = URL(link.url)
        # remove trailing slashes, so basename works correctly
        newurl = newurl.asfile()
        if not newurl.is_valid_http_url():
            continue
        if not newurl.path.startswith(baseurl.path):
            continue
        if basehost != newurl.replace(path=''):
            continue
        projects.add(newurl.basename)
    return projects
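The loop above applies the same four-step filter to every link. A standalone sketch of that filter chain, assuming the devpi_common.url.URL semantics used in the method (asfile(), is_valid_http_url(), basename); the base URL and links are illustrative:

from devpi_common.url import URL

baseurl = URL("https://pypi.example.org/simple/")
basehost = baseurl.replace(path='')

def accept(link_url):
    # same filter chain as the loop above
    newurl = URL(link_url).asfile()  # strip trailing slash so basename works
    if not newurl.is_valid_http_url():
        return None
    if not newurl.path.startswith(baseurl.path):
        return None
    if basehost != newurl.replace(path=''):
        return None
    return newurl.basename

assert accept("https://pypi.example.org/simple/pytest/") == "pytest"
assert accept("https://evil.example.org/simple/pytest/") is None   # foreign host
assert accept("https://pypi.example.org/other/pytest/") is None    # outside base path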
def _get_normalized_url(self, url):
    # returns url with port always included
    url = URL(url)
    if ':' not in url.netloc:
        if url.scheme == 'http':
            url = url.replace(netloc="%s:80" % url.netloc)
        elif url.scheme == 'https':
            url = url.replace(netloc="%s:443" % url.netloc)
    return url.url
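A sketch of why the normalization matters: with the scheme's default port filled in, two spellings of the same origin compare equal as strings. URL is devpi_common.url.URL; the host name is illustrative:

from devpi_common.url import URL

def normalize(url):
    # add the scheme's default port when none is given,
    # mirroring _get_normalized_url above
    url = URL(url)
    if ':' not in url.netloc:
        if url.scheme == 'http':
            url = url.replace(netloc="%s:80" % url.netloc)
        elif url.scheme == 'https':
            url = url.replace(netloc="%s:443" % url.netloc)
    return url.url

# both spellings of the same origin now compare equal
assert normalize("http://example.com/simple/") == normalize("http://example.com:80/simple/")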
def simpleindex_auth(self):
    indexserver = URL(self.simpleindex)
    basic_auth = self.get_basic_auth(indexserver)
    if basic_auth:
        indexserver = indexserver.replace(
            netloc="%s@%s" % (':'.join(basic_auth), indexserver.netloc))
    return indexserver.url
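A usage sketch for the netloc rewrite above; the index URL and the (user, password) tuple are illustrative assumptions, not real credentials:

from devpi_common.url import URL

indexserver = URL("https://pypi.example.org/root/pypi/+simple/")
basic_auth = ("alice", "s3cret")  # illustrative credentials
indexserver = indexserver.replace(
    netloc="%s@%s" % (':'.join(basic_auth), indexserver.netloc))
assert indexserver.url == "https://alice:s3cret@pypi.example.org/root/pypi/+simple/"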
class ProjectParser(HTMLParser):
    def __init__(self, url):
        HTMLParser.__init__(self)
        self.projects = set()
        self.baseurl = URL(url)
        self.basehost = self.baseurl.replace(path='')
        self.project = None

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.project = None
            attrs = dict(attrs)
            if 'href' not in attrs:
                return
            href = attrs['href']
            if '://' not in href:
                project = href.rstrip('/').rsplit('/', 1)[-1]
            else:
                newurl = self.baseurl.joinpath(href)
                # remove trailing slashes, so basename works correctly
                newurl = newurl.asfile()
                if not newurl.is_valid_http_url():
                    return
                if not newurl.path.startswith(self.baseurl.path):
                    return
                if self.basehost != newurl.replace(path=''):
                    return
                project = newurl.basename
            self.project = project

    def handle_endtag(self, tag):
        if tag == 'a' and self.project:
            self.projects.add(self.project)
            self.project = None
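A usage sketch for ProjectParser; the HTML is an illustrative fragment of a simple-index page, and the expected result assumes URL.joinpath resolves absolute hrefs like urljoin does:

parser = ProjectParser("https://pypi.example.org/simple/")
parser.feed("""
<html><body>
<a href="/simple/devpi-server/">devpi-server</a>
<a href="https://pypi.example.org/simple/pytest/">pytest</a>
<a href="https://evil.example.org/simple/pytest/">pytest</a>
</body></html>
""")
# relative hrefs are taken as-is, absolute ones must stay on the base host
assert sorted(parser.projects) == ['devpi-server', 'pytest']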
def test_replace(self):
    url = URL("http://qwe/foo?bar=ham#hash")
    assert url.replace(
        scheme='https').url == "https://qwe/foo?bar=ham#hash"
    assert url.replace(scheme='').url == "//qwe/foo?bar=ham#hash"
    assert url.replace(
        netloc='world').url == "http://world/foo?bar=ham#hash"
    assert url.replace(netloc='').url == "http:///foo?bar=ham#hash"
    assert url.replace(path='/').url == "http://qwe/?bar=ham#hash"
    assert url.replace(path='').url == "http://qwe?bar=ham#hash"
    assert url.replace(query='').url == "http://qwe/foo#hash"
    assert url.replace(fragment='').url == "http://qwe/foo?bar=ham"
    assert url.replace(fragment='foo').url == "http://qwe/foo?bar=ham#foo"
    # original shouldn't have changed
    assert url.url == "http://qwe/foo?bar=ham#hash"
    # trying to change something not existing does nothing
    assert url.replace(foo='https').url == "http://qwe/foo?bar=ham#hash"
def _get_remote_projects(self):
    headers = {"Accept": "text/html"}
    response = self.httpget(
        self.mirror_url, allow_redirects=True, extra_headers=headers)
    if response.status_code != 200:
        raise self.UpstreamError(
            "URL %r returned %s",
            self.mirror_url, response.status_code)
    page = HTMLPage(response.text, response.url)
    projects = set()
    baseurl = URL(response.url)
    basehost = baseurl.replace(path='')
    for link in page.links:
        newurl = URL(link.url)
        # remove trailing slashes, so basename works correctly
        newurl = newurl.asfile()
        if not newurl.is_valid_http_url():
            continue
        if not newurl.path.startswith(baseurl.path):
            continue
        if basehost != newurl.replace(path=''):
            continue
        projects.add(newurl.basename)
    return projects
def _get_remote_projects(self):
    headers = {"Accept": "text/html"}
    # use a minimum of 30 seconds as timeout for remote server and
    # 60s when running as replica, because the list can be quite large
    # and the master might take a while to process it
    if self.xom.is_replica():
        timeout = max(self.timeout, 60)
    else:
        timeout = max(self.timeout, 30)
    response = self.httpget(
        self.mirror_url, allow_redirects=True,
        extra_headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise self.UpstreamError(
            "URL %r returned %s %s",
            self.mirror_url, response.status_code, response.reason)
    projects = set()
    baseurl = URL(response.url)
    basehost = baseurl.replace(path='')
    for elem in self._iter_remote_project_links(response):
        href = elem.attrib['href']
        if '://' not in href:
            project = href.rstrip('/').rsplit('/', 1)[-1]
        else:
            newurl = baseurl.joinpath(href)
            # remove trailing slashes, so basename works correctly
            newurl = newurl.asfile()
            if not newurl.is_valid_http_url():
                continue
            if not newurl.path.startswith(baseurl.path):
                continue
            if basehost != newurl.replace(path=''):
                continue
            project = newurl.basename
        projects.add(project)
    return projects
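_iter_remote_project_links is not part of this excerpt. A plausible sketch of such a helper, assuming lxml's HTMLPullParser is an acceptable dependency; the actual implementation may differ:

from lxml.etree import HTMLPullParser

def _iter_remote_project_links(self, response):
    # hypothetical sketch: stream <a> elements out of the body
    # instead of building a full link list as the HTMLPage-based
    # version above did
    parser = HTMLPullParser(events=('end',), tag='a')
    parser.feed(response.text)
    for _event, elem in parser.read_events():
        if 'href' in elem.attrib:
            yield elem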
def test_replace_nothing(self):
    url = URL("http://qwe/foo?bar=ham#hash")
    new_url = url.replace()
    assert new_url is not url
    assert new_url.url == url.url
def fetch(self, handler, url):
    if self.initial_fetch:
        url = URL(url)
        if url.query:
            url = url.replace(query=url.query + '&initial_fetch')
        else:
            url = url.replace(query='initial_fetch')
        url = url.url
    log = self.log
    config = self.xom.config
    log.info("fetching %s", url)
    uuid, master_uuid = make_uuid_headers(config.nodeinfo)
    assert uuid != master_uuid
    try:
        self.master_contacted_at = time.time()
        token = self.auth_serializer.dumps(uuid)
        r = self.session.get(url, auth=self.master_auth, headers={
            H_REPLICA_UUID: uuid,
            H_EXPECTED_MASTER_ID: master_uuid,
            H_REPLICA_OUTSIDE_URL: config.args.outside_url,
            'Authorization': 'Bearer %s' % token},
            timeout=self.REPLICA_REQUEST_TIMEOUT)
    except Exception as e:
        msg = ''.join(
            traceback.format_exception_only(e.__class__, e)).strip()
        log.error("error fetching %s: %s", url, msg)
        return False
    if r.status_code not in (200, 202):
        log.error("%s %s: failed fetching %s",
                  r.status_code, r.reason, url)
        return False
    # we check that the remote instance
    # has the same UUID we saw last time
    master_uuid = config.get_master_uuid()
    remote_master_uuid = r.headers.get(H_MASTER_UUID)
    if not remote_master_uuid:
        # we don't fatally leave the process because
        # it might just be a temporary misconfiguration
        # for example of a nginx frontend
        log.error("remote provides no %r header, running "
                  "<devpi-server-2.1? headers were: %s",
                  H_MASTER_UUID, r.headers)
        self.thread.sleep(self.ERROR_SLEEP)
        return True
    if master_uuid and remote_master_uuid != master_uuid:
        # we got a master_uuid and it is not the one we
        # expect, we are replicating for -- it's unlikely this heals
        # itself.  It's thus better to die and signal we can't operate.
        log.error("FATAL: master UUID %r does not match "
                  "expected master UUID %r. EXITING.",
                  remote_master_uuid, master_uuid)
        # force exit of the process
        os._exit(3)
    try:
        remote_serial = int(r.headers["X-DEVPI-SERIAL"])
    except Exception as e:
        msg = ''.join(
            traceback.format_exception_only(e.__class__, e)).strip()
        log.error("error fetching %s: %s", url, msg)
        return False
    if r.status_code == 200:
        try:
            handler(r)
        except Exception:
            log.exception("could not process: %s", r.url)
        else:
            # we successfully received data so let's
            # record the master_uuid for future consistency checks
            if not master_uuid:
                self.xom.config.set_master_uuid(remote_master_uuid)
            # also record the current master serial for status info
            self.update_master_serial(remote_serial)
            return True
    elif r.status_code == 202:
        log.debug("%s: trying again %s\n", r.status_code, url)
        # also record the current master serial for status info
        self.update_master_serial(remote_serial)
        return True
    return False
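The auth_serializer and make_uuid_headers are set up elsewhere. A minimal sketch of a compatible token round-trip using itsdangerous; the shared secret, the uuid value, and the max_age are illustrative assumptions, not devpi's actual configuration:

from itsdangerous import TimedSerializer

auth_serializer = TimedSerializer("shared-secret")  # illustrative secret

# replica side: sign its uuid and send it as a bearer token
token = auth_serializer.dumps("replica-uuid-1234")
headers = {'Authorization': 'Bearer %s' % token}

# master side: verify the signature and reject stale tokens
uuid = auth_serializer.loads(token, max_age=60)
assert uuid == "replica-uuid-1234"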