def abort_if_invalid_filename(name, filename):
    """Abort the upload (HTTP 400) unless *filename* is a valid archive
    name belonging to project *name*.

    Fix: as previously flattened, ``return abort_submit(...)`` was
    attached to the match branch, so a filename that DID match the
    project name triggered the abort.  The correct flow is: return
    silently on a normalized-prefix match, abort otherwise.
    """
    if not is_valid_archive_name(filename):
        abort_submit(400, "%r is not a valid archive name" % (filename))
    if normalize_name(filename).startswith(normalize_name(name)):
        # filename belongs to the project: accept
        return
    abort_submit(
        400, "filename %r does not match project name %r" % (filename, name))
def abort_if_invalid_filename(name, filename):
    """Abort the upload (HTTP 400) unless *filename* is a valid archive
    name belonging to project *name*.

    Fix: the flattened form aborted exactly when the normalized
    filename matched the project name (inverted condition).  Return on
    a match; abort only when the filename does not match.
    """
    if not is_valid_archive_name(filename):
        abort_submit(400, "%r is not a valid archive name" %(filename))
    if normalize_name(filename).startswith(normalize_name(name)):
        # filename belongs to the project: accept
        return
    abort_submit(
        400,
        "filename %r does not match project name %r" %(filename, name))
def process_sub_hits(self, stage, sub_hits, data):
    """Enrich raw search sub-hits with title, highlight text and URL.

    Sub-hits of type 'project' are dropped (the parent hit already
    covers them); 'title'/'page' hits link into the unpacked docs;
    'keywords'/'description'/'summary' hits link to the version page.
    Returns the list of surviving, enriched sub-hit dicts.
    """
    search_index = self.request.registry['search_index']
    result = []
    for sub_hit in sub_hits:
        sub_data = sub_hit['data']
        text_type = sub_data['type']
        title = text_type.title()
        highlight = None
        if text_type == 'project':
            continue
        elif text_type in ('title', 'page'):
            docs = self.get_docs(stage, data)
            try:
                entry = docs[sub_data['text_path']]
            except KeyError:
                # the indexed path no longer exists in the doczip:
                # show a diagnostic instead of crashing the search page
                highlight = (
                    "Couldn't access documentation files for %s "
                    "version %s on %s. This is a bug. If you find a way "
                    "to reproduce this, please file an issue at: "
                    "https://github.com/devpi/devpi/issues" % (
                        data['name'], data['doc_version'], stage.name))
            else:
                text = entry['text']
                highlight = search_index.highlight(text, sub_hit.get('words'))
            title = sub_data.get('text_title', title)
            text_path = sub_data.get('text_path')
            if text_path:
                sub_hit['url'] = self.request.route_url(
                    "docviewroot", user=data['user'], index=data['index'],
                    project=normalize_name(data['name']),
                    version=data['doc_version'],
                    relpath="%s.html" % text_path)
        elif text_type in ('keywords', 'description', 'summary'):
            metadata = self.get_versiondata(stage, data)
            if metadata is None:
                continue
            text = metadata.get(text_type)
            if text is None:
                continue
            highlight = search_index.highlight(text, sub_hit.get('words'))
            # only link when a concrete version is part of the hit data
            if 'version' in data:
                sub_hit['url'] = self.request.route_url(
                    "/{user}/{index}/{project}/{version}",
                    user=data['user'], index=data['index'],
                    project=normalize_name(data['name']),
                    version=data['version'],
                    _anchor=text_type)
        else:
            log.error("Unknown type %s" % text_type)
            continue
        sub_hit['title'] = title
        sub_hit['highlight'] = highlight
        result.append(sub_hit)
    return result
def get_project_info_perstage(self, name):
    """Look up a project on this stage by a possibly non-canonical name.

    Returns a ProjectInfo wrapping the stored real name, or None when
    no project normalizes to the same name.
    """
    assert py.builtin._istext(name)
    norm_to_real = {
        normalize_name(real): real
        for real in self.getprojectnames_perstage()
    }
    realname = norm_to_real.get(normalize_name(name))
    if realname:
        return ProjectInfo(self, realname)
def get_projectname_perstage(self, name):
    """Return the stored project name matching *name*.

    *name* may be spelled non-canonically; an exact match wins,
    otherwise the first stored name with the same normalization is
    returned.  Returns None when nothing matches.
    """
    assert py.builtin._istext(name)
    known = self.list_projectnames_perstage()
    if name in known:
        return name
    wanted = normalize_name(name)
    matches = (candidate for candidate in known
               if normalize_name(candidate) == wanted)
    return next(matches, None)
def process_sub_hits(self, stage, sub_hits, data):
    """Enrich raw search sub-hits with title, highlight text and URL.

    Fix: ``docs[sub_data['text_path']]`` raised an unhandled KeyError
    when an indexed documentation path no longer exists in the unpacked
    doczip, taking down the whole search result page.  Handle the
    missing entry with a diagnostic highlight instead (matching the
    robust variant of this method elsewhere in the codebase).
    """
    search_index = self.request.registry['search_index']
    result = []
    for sub_hit in sub_hits:
        sub_data = sub_hit['data']
        text_type = sub_data['type']
        title = text_type.title()
        highlight = None
        if text_type == 'project':
            continue
        elif text_type in ('title', 'page'):
            docs = self.get_docs(stage, data)
            try:
                entry = docs[sub_data['text_path']]
            except KeyError:
                # indexed path vanished from the doczip: report, don't crash
                highlight = (
                    "Couldn't access documentation files for %s "
                    "version %s on %s. This is a bug. If you find a way "
                    "to reproduce this, please file an issue at: "
                    "https://github.com/devpi/devpi/issues" % (
                        data['name'], data['doc_version'], stage.name))
            else:
                text = entry['text']
                highlight = search_index.highlight(text, sub_hit.get('words'))
            title = sub_data.get('text_title', title)
            text_path = sub_data.get('text_path')
            if text_path:
                sub_hit['url'] = self.request.route_url(
                    "docviewroot", user=data['user'], index=data['index'],
                    project=normalize_name(data['name']),
                    version=data['doc_version'],
                    relpath="%s.html" % text_path)
        elif text_type in ('keywords', 'description', 'summary'):
            metadata = self.get_versiondata(stage, data)
            if metadata is None:
                continue
            text = metadata.get(text_type)
            if text is None:
                continue
            highlight = search_index.highlight(text, sub_hit.get('words'))
            # only link when a concrete version is part of the hit data
            if 'version' in data:
                sub_hit['url'] = self.request.route_url(
                    "/{user}/{index}/{project}/{version}",
                    user=data['user'], index=data['index'],
                    project=normalize_name(data['name']),
                    version=data['version'],
                    _anchor=text_type)
        else:
            log.error("Unknown type %s" % text_type)
            continue
        sub_hit['title'] = title
        sub_hit['highlight'] = highlight
        result.append(sub_hit)
    return result
def preprocess_project(stage, name_input):
    """Assemble the template/search data dict for a project on *stage*.

    Returns a dict with at least name/user/index; when the project is
    cached it also contains the latest version's metadata, and doczip
    information for the newest version that has documentation.
    """
    name = normalize_name(name_input)
    try:
        user = stage.user.name
        index = stage.index
    except AttributeError:
        # stage objects without user/index attributes encode both in
        # their "user/index" name
        user, index = stage.name.split('/')
    if not is_project_cached(stage, name):
        # uncached (e.g. mirror) project: only identity info available
        return dict(name=name, user=user, index=index)
    # setuptools fields that should be dropped when empty or 'UNKNOWN'
    setuptools_metadata = frozenset((
        'author', 'author_email', 'classifiers', 'description',
        'download_url', 'home_page', 'keywords', 'license', 'platform',
        'summary'))
    versions = get_sorted_versions(stage.list_versions_perstage(name))
    result = dict(name=name)
    for i, version in enumerate(versions):
        if i == 0:
            # newest version provides the displayed metadata
            verdata = stage.get_versiondata_perstage(name, version)
            result.update(verdata)
        links = stage.get_linkstore_perstage(name, version).get_links(rel="doczip")
        if links:
            # we assume it has been unpacked
            result['doc_version'] = version
            result['+doczip'] = Docs(stage, name, version)
            break
    else:
        # no version had a doczip
        assert '+doczip' not in result
    result[u'user'] = user
    result[u'index'] = index
    for key in setuptools_metadata:
        if key in result:
            value = result[key]
            if value == 'UNKNOWN' or not value:
                del result[key]
    return result
def dump(self):
    """Export this stage's projects, release files, toxresults and
    doczips into the export metadata/directory structure.

    Mirror indexes are not exported (only completion is reported).
    """
    if self.stage.ixconfig["type"] == "mirror":
        # nothing to export for mirrors; keep the loop below a no-op
        projects = []
    else:
        self.indexmeta["projects"] = {}
        self.indexmeta["files"] = []
        projects = self.stage.list_projects_perstage()
    for name in projects:
        data = {}
        versions = self.stage.list_versions_perstage(name)
        for version in versions:
            v = self.stage.get_versiondata_perstage(name, version)
            # mutable copy so internal link data can be stripped
            data[version] = get_mutable_deepcopy(v)
        for val in data.values():
            val.pop("+elinks", None)
        norm_name = normalize_name(name)
        assert norm_name not in self.indexmeta["projects"]
        self.indexmeta["projects"][norm_name] = data
        for version in data:
            # use the real (per-version) name for file lookups
            vername = data[version]["name"]
            linkstore = self.stage.get_linkstore_perstage(vername, version)
            self.basedir.ensure(dir=1)
            self.dump_releasefiles(linkstore)
            self.dump_toxresults(linkstore)
            entry = self.stage.get_doczip_entry(vername, version)
            if entry:
                self.dump_docfile(vername, version, entry)
    self.exporter.completed("index %r" % self.stage.name)
def store_releasefile(self, project, version, filename, content,
                      last_modified=None):
    """Store a release file for a registered project/version.

    Raises MissesRegistration when no version metadata exists; before
    giving up, retries with underscores swapped to dashes, since the
    version may have been guessed from a filename.
    Returns the created link and regenerates the simple-page links.
    """
    project = normalize_name(project)
    filename = ensure_unicode(filename)
    if not self.get_versiondata_perstage(project, version):
        # There's a chance the version was guessed from the
        # filename, which might have swapped dashes to underscores
        if '_' not in version:
            raise MissesRegistration("%s-%s", project, version)
        version = version.replace('_', '-')
        if not self.get_versiondata_perstage(project, version):
            raise MissesRegistration("%s-%s", project, version)
    linkstore = self.get_linkstore_perstage(project, version,
                                            readonly=False)
    link = linkstore.create_linked_entry(
        rel="releasefile",
        basename=filename,
        file_content=content,
        last_modified=last_modified)
    self._regen_simplelinks(project)
    return link
def init_pypi_mirror(self, proxy):
    """ initialize pypi mirror if no mirror state exists.

    Loads the cached name->serial mapping from keyfs, or fetches the
    initial list from pypi via *proxy* (fatal if unreachable).  Then
    ensures all names are unicode and builds the normalized-name ->
    real-name lookup table.
    """
    self.proxy = proxy
    name2serials = self.keyfs.PYPISERIALS.get({})
    if not name2serials:
        log.info("retrieving initial name/serial list")
        name2serials = proxy.list_packages_with_serial()
        if name2serials is None:
            from devpi_server.main import fatal
            fatal("mirror initialization failed: "
                  "pypi.python.org not reachable")
        self.keyfs.PYPISERIALS.set(name2serials)
    else:
        log.info("reusing already cached name/serial list")
    # normalize to unicode->serial mapping
    for name in list(name2serials):
        if not py.builtin._istext(name):
            val = name2serials.pop(name)
            name2serials[py.builtin._totext(name, "utf-8")] = val
    self.name2serials = name2serials
    # create a mapping of normalized name to real name
    # (only names whose normalization differs are stored)
    self.normname2name = d = dict()
    for name in name2serials:
        norm = normalize_name(name)
        if norm != name:
            d[norm] = name
def parse_index(self, disturl, html, scrape=True):
    """Parse a simple-index HTML page for release/egg links.

    Collects release links for self.projectname, egg links (when
    scraping), and — when *scrape* is true — queues the page's rel
    links for further crawling.
    """
    p = HTMLPage(html, disturl.url)
    seen = set()
    for link in p.links:
        newurl = URL(link.url)
        if not newurl.is_valid_http_url():
            continue
        eggfragment = newurl.eggfragment
        if scrape and eggfragment:
            # self.projectname is already normalized (see __init__)
            if normalize_name(eggfragment).startswith(self.projectname):
                # XXX seems we have to maintain a particular
                # order to keep pip/easy_install happy with some
                # packages (e.g. nose)
                if newurl not in self.egglinks:
                    self.egglinks.insert(0, newurl)
            else:
                log.debug("skip egg link %s (projectname: %s)",
                          newurl, self.projectname)
            continue
        if is_archive_of_project(newurl, self.projectname):
            if not newurl.is_valid_http_url():
                log.warn("unparseable/unsupported url: %r", newurl)
            else:
                seen.add(newurl.url)
                self._mergelink_ifbetter(newurl)
            continue
    if scrape:
        # queue linked pages we have not yet processed for crawling
        for link in p.rel_links():
            if link.url not in seen:
                disturl = URL(link.url)
                if disturl.is_valid_http_url():
                    self.crawllinks.add(disturl)
def get_releaselinks_perstage(self, project):
    """Compatibility access method for devpi-findlinks and possibly
    other plugins: returns ELink objects for this stage's simple links."""
    project = normalize_name(project)
    elinks = []
    for key, href in self.get_simplelinks_perstage(project):
        elinks.append(self._make_elink(project, key, href))
    return elinks
def get_releaselinks(self, project):
    """Compatibility access method used by devpi-web and tests:
    returns ELink objects for all simple links of *project*."""
    project = normalize_name(project)
    links = self.get_simplelinks(project)
    make = self._make_elink
    return [make(project, key, href) for key, href in links]
def _set_versiondata(self, metadata):
    """Persist version *metadata* and register version/project in the
    per-project and per-index key sets.

    Skips the write entirely when the stored metadata is clean and no
    field actually changed (avoids needless db/replica churn).
    """
    project = normalize_name(metadata["name"])
    version = metadata["version"]
    key_projversion = self.key_projversion(project, version)
    versiondata = key_projversion.get(readonly=False)
    if not key_projversion.is_dirty():
        # check if something really changed to prevent
        # unneccessary changes on db/replica level
        for key, val in metadata.items():
            if val != versiondata.get(key):
                break
        else:
            # no field differs: nothing to do
            threadlog.info("not re-registering same metadata for %s-%s",
                           project, version)
            return
    versiondata.update(metadata)
    key_projversion.set(versiondata)
    threadlog.info("set_metadata %s-%s", project, version)
    versions = self.key_projversions(project).get(readonly=False)
    if version not in versions:
        versions.add(version)
        self.key_projversions(project).set(versions)
    projects = self.key_projects.get(readonly=False)
    if project not in projects:
        projects.add(project)
        self.key_projects.set(projects)
def _save_cache_links(self, project, links, requires_python, yanked, serial): assert links != () # we don't store the old "Not Found" marker anymore assert isinstance(serial, int) assert project == normalize_name(project), project data = { "serial": serial, "links": links, "requires_python": requires_python, "yanked": yanked } key = self.key_projsimplelinks(project) old = key.get() if old != data: threadlog.debug("saving changed simplelinks for %s: %s", project, data) key.set(data) # maintain list of currently cached project names to enable # deletion and offline mode self.add_project_name(project) # XXX if the transaction fails the links are still marked # as refreshed but the data was not persisted. It's a rare # enough event (tm) to not worry too much, though. # (we can, however, easily add a # keyfs.tx.on_commit_success(callback) method. self.cache_retrieve_times.refresh(project)
def set_project_serial(self, name, serial):
    """ set the current serial and fill normalization table. """
    norm = normalize_name(name)
    self.name2serials[name] = serial
    if norm != name:
        # remember the real spelling for the normalized lookup
        self.normname2name[norm] = name
    return norm
def on_changed_file_entry(self, ev):
    """ when a file entry is modified.

    Triggers the devpiserver_on_remove_file hook for deleted entries
    and devpiserver_on_upload for freshly uploaded release files.
    Mirror stages are ignored.
    """
    params = ev.typedkey.params
    user = params.get("user")
    index = params.get("index")
    keyfs = self.xom.keyfs
    # read state at the serial of the event for a consistent view
    with keyfs.transaction(at_serial=ev.at_serial):
        stage = self.xom.model.getstage(user, index)
        if stage is not None and stage.ixconfig["type"] == "mirror":
            # we don't trigger on file changes of pypi mirror
            return
        entry = FileEntry(self.xom, ev.typedkey, meta=ev.value)
        if not entry.project or not entry.version:
            # the entry was deleted
            self.xom.config.hook.devpiserver_on_remove_file(
                stage=stage, relpath=ev.typedkey.relpath)
            return
        name = entry.project
        assert name == normalize_name(name)
        linkstore = stage.get_linkstore_perstage(name, entry.version)
        links = linkstore.get_links(basename=entry.basename)
        # exactly one link means this basename was just uploaded
        if len(links) == 1:
            self.xom.config.hook.devpiserver_on_upload(
                stage=stage, project=name, version=entry.version,
                link=links[0])
def store_doczip(self, project, version, content):
    """Store a documentation zip for *project*.

    When *version* is falsy, the latest release version is used
    (MissesVersion is raised if none exists).  A stub version entry
    is registered when the version has no metadata yet.  Returns the
    created link.
    """
    project = normalize_name(project)
    if not version:
        version = self.get_latest_version_perstage(project)
        if not version:
            raise MissesVersion(
                "doczip has no version and '%s' has no releases to "
                "derive one from", project)
        threadlog.info("store_doczip: derived version of %s is %s",
                       project, version)
    if not self.get_versiondata_perstage(project, version,
                                         readonly=False):
        # register a stub so the doczip has version metadata to hang on
        self.set_versiondata({'name': project, 'version': version})
    linkstore = self.get_linkstore_perstage(project, version,
                                            readonly=False)
    return linkstore.create_linked_entry(
        rel="doczip",
        basename="%s-%s.doc.zip" % (project, version),
        file_content=content,
    )
def op_sro_check_mirror_whitelist(self, opname, **kw):
    """Run *opname* across the stage resolution order, yielding
    (stage, result) pairs while enforcing the mirror whitelist.

    Once a private index has the project, mirrors are skipped unless
    the project is whitelisted on an earlier private stage.
    """
    project = normalize_name(kw["project"])
    whitelisted = private_hit = False
    for stage in self.sro():
        if stage.ixconfig["type"] == "mirror":
            if private_hit:
                if not whitelisted:
                    # private package shadows the mirror: skip it
                    threadlog.debug(
                        "%s: private package %r not whitelisted, "
                        "ignoring %s", opname, project, stage.name)
                    continue
                threadlog.debug(
                    "private package %r whitelisted at stage %s",
                    project, whitelisted.name)
        else:
            whitelist = set(stage.ixconfig["mirror_whitelist"])
            if '*' in whitelist or project in whitelist:
                whitelisted = stage
            elif stage.has_project_perstage(project):
                # project exists privately and is not whitelisted
                private_hit = True
        try:
            res = getattr(stage, opname)(**kw)
            private_hit = private_hit or res
            yield stage, res
        except UpstreamError as exc:
            # If we are currently checking ourself raise the error, it is fatal
            if stage is self:
                raise
            threadlog.warn(
                'Failed to check mirror whitelist. Assume it does not exists (%s)',
                exc)
def _regen_simplelinks(self, project_input):
    """Rebuild and persist the simple-page link list for a project
    from all of its versions' release files."""
    project = normalize_name(project_input)
    links = []
    for version in self.list_versions_perstage(project):
        store = self.get_linkstore_perstage(project, version)
        links.extend(make_key_and_href(link)
                     for link in store.get_links("releasefile"))
    self.key_projsimplelinks(project).set({"links": links})
def _dump_project_cache(self, projectname, dumplist, serial):
    """Persist the link dump for *projectname* under its normalized
    keyfs key, keeping the original spelling in the payload."""
    payload = {
        "serial": serial,
        "entrylist": dumplist,
        "projectname": projectname,
    }
    self.keyfs.PYPILINKS(name=normalize_name(projectname)).set(payload)
def get_releaselinks(self, project):
    """Compatibility access method used by devpi-web and tests.

    Returns [] when the project is unknown upstream.
    """
    project = normalize_name(project)
    try:
        elinks = []
        for key, href, require_python in self.get_simplelinks(project):
            elinks.append(
                self._make_elink(project, key, href, require_python))
        return elinks
    except self.UpstreamNotFoundError:
        return []
def get_unpack_path(stage, name, version):
    """Return the directory where the doczip for name/version on
    *stage* is unpacked.

    Uses the configured documentation_path when set, otherwise the
    keyfs base directory.
    """
    configured = stage.xom.config.args.documentation_path
    if configured is None:
        base = stage.keyfs.basedir
    else:
        base = py.path.local(configured)
    return base.join(stage.user.name, stage.index,
                     normalize_name(name), version, "+doc")
def iter_projects_normalized(self, projects):
    """Yield (normalized_name, versions) pairs, merging the version
    dicts of all spellings that normalize to the same name."""
    grouped = {}
    for raw_name in projects:
        grouped.setdefault(normalize_name(raw_name), set()).add(raw_name)
    for norm, raw_names in grouped.items():
        merged = {}
        for raw_name in raw_names:
            merged.update(projects[raw_name])
        yield (norm, merged)
def del_project(self, project):
    """Delete a project, all its versions and links from this stage."""
    project = normalize_name(project)
    # delete each version without per-version cleanup; simplelinks are
    # regenerated once afterwards instead
    for version in list(self.key_projversions(project).get()):
        self.del_versiondata(project, version, cleanup=False)
    self._regen_simplelinks(project)
    with self.key_projects.update() as projects:
        projects.remove(project)
    threadlog.info("deleting project %s", project)
    self.key_projversions(project).delete()
def __init__(self, stage, project, version, readonly=True):
    """Bind to the metadata of project/version on *stage*.

    Raises MissesRegistration when no version metadata exists.
    """
    normalized = normalize_name(project)
    self.stage = stage
    self.filestore = stage.filestore
    self.project = normalized
    self.version = version
    self.verdata = stage.get_versiondata_perstage(
        normalized, version, readonly=readonly)
    if not self.verdata:
        raise MissesRegistration("%s-%s on stage %s",
                                 project, version, stage.name)
def _set_project_serial(self, name, serial):
    """ set the current serial and fill normalization table
    if project does not exist.

    Fix: the flattened form executed both identical branches and
    applied the normalization fill unconditionally; per the docstring,
    the normname2name entry is only added when the project is new.
    """
    if name in self.name2serials:
        # already known: just update the serial
        self.name2serials[name] = serial
    else:
        # new project: record serial and remember the real spelling
        # for normalized lookups (only when the spellings differ)
        self.name2serials[name] = serial
        n = normalize_name(name)
        if n != name:
            self.normname2name[n] = name
def init_pypi_mirror(self, proxy):
    """ initialize pypi mirror if no mirror state exists. """
    self.name2serials = self.load_name2serials(proxy)
    # create a mapping of normalized name to real name; identical
    # spellings need no entry
    mapping = dict()
    for realname in self.name2serials:
        normed = normalize_name(realname)
        assert py.builtin._istext(normed)
        assert py.builtin._istext(realname)
        if normed != realname:
            mapping[normed] = realname
    self.normname2name = mapping
def _dump_project_cache(self, projectname, entries, serial):
    """Cache the (relpath, md5, eggfragment) tuples for *projectname*
    at *serial* and return the corresponding elink objects."""
    entrylist = [(e.relpath, e.md5, e.eggfragment) for e in entries]
    data = {
        "serial": serial,
        "latest_serial": serial,
        "entrylist": entrylist,
        "projectname": projectname,
    }
    threadlog.debug("saving data for %s: %s", projectname, data)
    self.keyfs.PYPILINKS(name=normalize_name(projectname)).set(data)
    return list(self._make_elinks(projectname, data["entrylist"]))
def _dump_project_cache(self, projectname, entries, serial):
    """Cache the (relpath, hash_spec, eggfragment) tuples for
    *projectname* at *serial* and return fresh elink objects."""
    normname = normalize_name(projectname)
    entrylist = []
    for entry in entries:
        entrylist.append(
            (entry.relpath, entry.hash_spec, entry.eggfragment))
    data = {
        "serial": serial,
        "latest_serial": serial,
        "entrylist": entrylist,
        "projectname": projectname,
    }
    threadlog.debug("saving data for %s: %s", projectname, data)
    self.keyfs.PYPILINKS(name=normname).set(data)
    return list(self._make_elinks(projectname, data["entrylist"]))
def result(self):
    """Post-process the raw search result: attach URLs/titles, enrich
    sub-hits and add "more results" links.

    Returns None when there is nothing to show, otherwise the result
    dict with its 'items' replaced by the processed list.
    """
    result = self.search_result
    if not result or not result['items']:
        return
    items = []
    for item in result['items']:
        data = item['data']
        stage = self.get_stage(data['path'])
        if stage is None:
            # stage vanished or is not accessible: drop the hit
            continue
        if 'version' in data:
            item['url'] = self.request.route_url(
                "/{user}/{index}/{project}/{version}",
                user=data['user'], index=data['index'],
                project=normalize_name(data['name']),
                version=data['version'])
            item['title'] = "%s-%s" % (data['name'], data['version'])
        else:
            item['url'] = self.request.route_url(
                "/{user}/{index}/{project}",
                user=data['user'], index=data['index'],
                project=normalize_name(data['name']))
            item['title'] = data['name']
        item['sub_hits'] = self.process_sub_hits(
            stage, item['sub_hits'], data)
        more_results = result['info']['collapsed_counts'][data['path']]
        if more_results:
            # link to a narrowed search showing the collapsed hits
            new_params = dict(self.params)
            new_params['query'] = "%s path:%s" % (
                self.params['query'], data['path'])
            item['more_url'] = self.request.route_url(
                'search', _query=new_params)
            item['more_count'] = more_results
        items.append(item)
    if not items:
        return
    result['items'] = items
    return result
def get_docs_info(request, stage, metadata):
    """Return a title/url dict for the unpacked documentation of the
    given version metadata, or None when *stage* is a mirror or no
    unpacked docs exist."""
    if stage.ixconfig['type'] == 'mirror':
        return
    name = normalize_name(metadata["name"])
    ver = metadata["version"]
    if not get_unpack_path(stage, name, ver).exists():
        return
    return dict(
        title="%s-%s" % (name, ver),
        url=request.route_url(
            "docviewroot", user=stage.user.name, index=stage.index,
            project=name, version=ver, relpath="index.html"))
def compute_global_projectname_normalization(self):
    """Build self.norm2name: normalized project name -> real name.

    The real name is taken from the highest version seen across all
    stages; names on root/pypi always win (modeled as an artificially
    high version).
    """
    self.tw.line("computing global projectname normalization map")
    norm2maxversion = {}
    # compute latest normname version across all stages
    for user in self.xom.model.get_userlist():
        userconfig = user.get()
        for indexname in userconfig.get("indexes", []):
            stage = self.xom.model.getstage(user.name, indexname)
            names = stage.list_projectnames_perstage()
            for name in names:
                # pypi names take precedence for defining the realname
                if stage.name == "root/pypi":
                    version = Version("999999.99999")
                    version.realname = name
                    norm2maxversion[normalize_name(name)] = version
                    continue
                versions = stage.list_versions_perstage(name)
                if versions:
                    maxver = None
                    for ver in versions:
                        version = Version(ver)
                        verdata = stage.get_versiondata(name, ver)
                        version.realname = verdata.get("name", name)
                        if maxver is None or version > maxver:
                            maxver = version
                    if not maxver:
                        continue
                    norm = normalize_name(name)
                    # keep the globally highest version per norm name
                    normver = norm2maxversion.setdefault(norm, maxver)
                    if maxver > normver:
                        norm2maxversion[norm] = maxver
    # determine real name of a project
    self.norm2name = norm2name = {}
    for norm, maxver in norm2maxversion.items():
        norm2name[norm] = maxver.realname
def filtered_list_project(self):
    """Serve the simple page for a project, restricted to the versions
    allowed by the stage's release filter.

    Aborts with 404 when the project is not covered by the filter and
    with 502 on upstream errors.
    """
    request = self.request
    abort_if_invalid_project(request, request.matchdict["project"])
    project = self.context.project
    # we only serve absolute links so we don't care about the route's slash
    stage = self.context.stage
    releasefilter = get_release_filter(stage).get(project)
    if releasefilter is None:
        abort(self.request, 404,
              "The project %s does not exist." %(project))
    try:
        links = stage.get_simplelinks(project, sorted_links=False)
    except stage.UpstreamError as e:
        threadlog.error(e.msg)
        abort(request, 502, e.msg)
    result = []
    for key, url in links:
        # split the filename into name/version candidates at dashes and
        # find the split whose name part normalizes to the project
        parts = splitext_archive(key)[0].split('-')
        for index in range(1, len(parts)):
            name = normalize_name('-'.join(parts[:index]))
            if name == project:
                version = '-'.join(parts[index:])
                break
        else:
            # filename does not belong to this project
            continue
        if version in releasefilter:
            result.append((key, url))
    if not result:
        # access will trigger 404 if not found
        self.request.context.verified_project
    # we don't need the extra stuff on the simple page for pip
    embed_form = False
    blocked_index = None
    response = Response(body=b"".join(self._simple_list_project(
        stage, project, result, embed_form, blocked_index)))
    if stage.ixconfig['type'] == 'mirror':
        serial = stage.key_projsimplelinks(project).get().get("serial")
        if serial > 0:
            response.headers[str("X-PYPI-LAST-SERIAL")] = str(serial)
    return response
def set_project_serial(self, name, serial):
    """ set the current serial and update projectname normalization
    table.

    Usually ``name`` is a "realname" not a normalized name.  But you
    can pass in a normalized name if the project is already known in
    which case we derive the real name automatically.  Passing
    ``serial=None`` deletes the project instead.
    """
    norm = normalize_name(name)
    # resolve a normalized spelling back to the known real name
    name = self.normname2name.get(norm, name)
    if serial is None:
        del self.name2serials[name]
        self.normname2name.pop(norm, None)
        return norm
    self.name2serials[name] = serial
    if norm != name:
        self.normname2name[norm] = name
    return norm
def dump(self):
    """Export every project of this stage (metadata, release files,
    toxresults and doczips) into the export structure."""
    import copy
    for name in self.stage.list_projectnames_perstage():
        data = {}
        versions = self.stage.list_versions_perstage(name)
        for version in versions:
            # deep copy so internal link data can be stripped safely
            data[version] = copy.deepcopy(
                self.stage.get_versiondata_perstage(name, version))
        for val in data.values():
            val.pop("+elinks", None)
        norm_name = normalize_name(name)
        assert norm_name not in self.indexmeta["projects"]
        self.indexmeta["projects"][norm_name] = data
        for version in data:
            # use the real (per-version) name for file lookups
            vername = data[version]["name"]
            linkstore = self.stage.get_linkstore_perstage(vername, version)
            self.dump_releasefiles(linkstore)
            self.dump_toxresults(linkstore)
            entry = self.stage.get_doczip_entry(vername, version)
            if entry:
                self.dump_docfile(vername, version, entry)
    self.exporter.completed("index %r" % self.stage.name)
def clear_cache(self, projectname):
    """Reset the cached links for *projectname*.

    We have to set the key to an empty dict instead of removing it,
    so replicas behave correctly.
    """
    self.keyfs.PYPILINKS(name=normalize_name(projectname)).set({})
    threadlog.debug("cleared cache for %s", projectname)
def get_releaselinks_perstage(self, projectname):
    """ return all releaselinks from the index and referenced scrape
    pages, returning cached entries if we have a recent enough
    request stored locally.

    Raise UpstreamError if the pypi server cannot be reached or
    does not return a fresh enough page although we know it must
    exist.
    """
    projectname = self.get_projectname_perstage(projectname)
    if projectname is None:
        return []
    is_fresh, links = self._load_cache_links(projectname)
    if links is not None and is_fresh:
        return links
    # get the simple page for the project
    url = self.PYPIURL_SIMPLE + projectname + "/"
    threadlog.debug("visiting index %s", url)
    response = self.httpget(url, allow_redirects=True)
    if response.status_code != 200:
        # if we have an old version, return it instead of erroring out
        if links is not None:
            threadlog.error(
                "serving stale links for %r, upstream not reachable",
                projectname)
            return links
        # XXX it's not correct to return UpstreamError in all cases
        # if indeed the project was deleted but that fact
        # is not yet properly processed
        raise self.UpstreamError("%s status on GET %s" % (
            response.status_code, url))
    if self.xom.is_replica():
        # XXX this code path is not currently tested, handle with care!
        # we have already triggered the master above
        # and now need to wait until the parsed new links are
        # transferred back to the replica
        devpi_serial = int(response.headers["X-DEVPI-SERIAL"])
        self.keyfs.notifier.wait_tx_serial(devpi_serial)
        # XXX raise TransactionRestart to get a consistent clean view
        self.keyfs.commit_transaction_in_thread()
        self.keyfs.begin_transaction_in_thread()
        is_fresh, links = self._load_cache_links(projectname)
        if links is not None:
            return links
        raise self.UpstreamError(
            "no cache links from master for %s" % projectname)
    # check that we got a fresh enough page
    serial = int(response.headers["X-PYPI-LAST-SERIAL"])
    newest_serial = self.pypimirror.name2serials.get(projectname, -1)
    if serial < newest_serial:
        raise self.UpstreamError(
            "%s: pypi returned serial %s, expected %s",
            projectname, serial, newest_serial)
    threadlog.debug("%s: got response with serial %s" % (
        projectname, serial))
    # check returned url has the same normalized name
    ret_projectname = response.url.strip("/").split("/")[-1]
    assert normalize_name(projectname) == normalize_name(ret_projectname)
    # parse simple index's link and perform crawling
    assert response.text is not None, response.text
    result = parse_index(response.url, response.text)
    perform_crawling(self, result)
    releaselinks = list(result.releaselinks)
    self.keyfs.restart_as_write_transaction()
    # compute release link entries and cache according to serial
    entries = [self.filestore.maplink(link) for link in releaselinks]
    return self._dump_project_cache(projectname, entries, serial)
def get_registered_name(self, name):
    """Return the registered real name for *name* (any spelling), or
    None if the project is unknown."""
    norm = normalize_name(name)
    candidate = self.normname2name.get(norm, norm)
    if candidate in self.name2serials:
        return candidate
def _dump_project_cache(self, projectname, dumplist, serial):
    """Persist the link dump for *projectname* at *serial* under its
    normalized keyfs key."""
    normname = normalize_name(projectname)
    payload = {
        "serial": serial,
        "entrylist": dumplist,
        "projectname": projectname,
    }
    self.keyfs.PYPILINKS(name=normname).set(payload)
def get_real_projectname(self, name):
    """Map any spelling of *name* to its canonical real name.

    Raises KeyError when the normalized name is unknown.
    """
    return self.norm2name[normalize_name(name)]
def _load_project_cache(self, projectname):
    """Return the cached link data for *projectname*, or None when
    nothing is cached."""
    key = self.keyfs.PYPILINKS(name=normalize_name(projectname))
    return key.get(None)
def get_project_info(self, name):
    """Return a ProjectInfo for *name* (any spelling), or None when
    the project is unknown."""
    norm = normalize_name(name)
    real = self.normname2name.get(norm, norm)
    if real in self.name2serials:
        return ProjectInfo(self, real)
def __init__(self, projectname):
    """Initialize empty link-collection state for the (normalized)
    project name."""
    self.egglinks = []
    self.crawllinks = set()
    self.basename2link = {}
    self.projectname = normalize_name(projectname)
def _load_project_cache(self, projectname):
    """Fetch the cached link data for *projectname* from keyfs."""
    normname = normalize_name(projectname)
    return self.keyfs.PYPILINKS(name=normname).get()
def key_projversion(self, name, version):
    """Return the keyfs PROJVERSION key for name/version on this
    user's index (name is normalized first)."""
    return self.keyfs.PROJVERSION(
        user=self.user.name, index=self.index,
        name=normalize_name(name), version=version)