def search_levenshtein(cls, q: str, limit: int = 15):
    """Order devices by edit distance to ``q``.

    Roughly equivalent SQL:
        SELECT * FROM devices ORDER BY levenshtein(name, :q) ASC;
    """
    query = cls.query.order_by(asc(func.levenshtein(Device.name, q))).limit(limit)
    return query
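# Note: func.levenshtein resolves to PostgreSQL's levenshtein(), which ships
# in the fuzzystrmatch extension and must be enabled per database. A minimal
# standalone sketch (the DSN is an assumption, not from the source):
from sqlalchemy import create_engine, func, select, text

engine = create_engine("postgresql:///example")  # assumed connection string
with engine.begin() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch"))
    # levenshtein('kitten', 'sitting') is the textbook example: distance 3
    print(conn.execute(select(func.levenshtein("kitten", "sitting"))).scalar_one())  # 3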
def query(self):
    tables = self.left.from_clause + self.right.from_clause
    left_lt = self.config.linktab.alias('__left_linktab')
    right_lt = self.config.linktab.alias('__right_linktab')
    tables += [left_lt, right_lt]
    columns = []
    score_length = func.greatest(func.length(self.left.key),
                                 func.length(self.right.key))
    score_leven = func.levenshtein(self.left.key, self.right.key)
    score_leven = cast(score_leven, Float)
    score = 1 - (score_leven / score_length)
    columns.append(score.label("score"))
    for field in self.left.fields:
        columns.append(field.column.label(field.column_ref))
    for field in self.right.fields:
        columns.append(field.column.label(field.column_ref))
    q = select(columns=columns, from_obj=tables)
    q = self.left.apply_filters(q)
    q = self.right.apply_filters(q)
    q = q.where(left_lt.c.key == self.left.key)
    q = q.where(left_lt.c.view == self.left.name)
    q = q.where(right_lt.c.key == self.right.key)
    q = q.where(right_lt.c.view == self.right.name)
    # TODO: make this levenshteinable
    q = q.where(right_lt.c.fingerprint == left_lt.c.fingerprint)
    q = q.limit(self.config.cutoff + 1)
    q = q.order_by(score.desc())
    q = q.distinct()
    return q
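# The score above normalizes edit distance into [0, 1]:
#   score = 1 - levenshtein(a, b) / greatest(length(a), length(b))
# A minimal pure-Python sketch of the same formula, with a textbook
# dynamic-programming levenshtein (illustrative, not taken from the source):
def levenshtein_py(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

def similarity(a: str, b: str) -> float:
    # mirrors the SQL expression: identical strings score 1.0
    return 1.0 - levenshtein_py(a, b) / max(len(a), len(b), 1)

assert levenshtein_py("kitten", "sitting") == 3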
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = normalize(text, dataset)[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    l = func.greatest(1.0,
                      func.least(len(match_text), func.length(text_field)))
    score = func.greatest(0.0,
                          ((l - func.levenshtein(text_field, match_text)) / l) * 100.0)
    score = func.max(score).label('score')

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label('id')

    # apply filters
    filters = [entities.c.dataset_id == dataset.id,
               entities.c.invalid == False]
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike('%%%s%%' % filter))

    q = select([id_, score], and_(*filters), [entities],
               group_by=[id_], order_by=[score.desc()])
    return Matches(q)
def guess_model(self):
    from skylines.model import Flight, AircraftModel

    # first try to find the reg number in the database
    if self.registration is not None:
        glider_reg = self.registration

        result = DBSession.query(Flight) \
            .filter(func.upper(Flight.registration) == func.upper(glider_reg)) \
            .order_by(desc(Flight.id)) \
            .first()

        if result and result.model_id:
            return result.model_id

    # try to find another flight with the same logger and use its aircraft type
    if (self.logger_id is not None
            and self.logger_manufacturer_id is not None):
        logger_id = self.logger_id
        logger_manufacturer_id = self.logger_manufacturer_id

        result = DBSession.query(Flight).outerjoin(IGCFile) \
            .filter(func.upper(IGCFile.logger_manufacturer_id) ==
                    func.upper(logger_manufacturer_id)) \
            .filter(func.upper(IGCFile.logger_id) == func.upper(logger_id)) \
            .filter(Flight.model_id == None) \
            .order_by(desc(Flight.id))

        if self.logger_manufacturer_id.startswith('X'):
            result = result.filter(Flight.pilot == self.owner)

        result = result.first()

        if result and result.model_id:
            return result.model_id

    if self.model is not None:
        glider_type = self.model.lower()

        # otherwise, try to guess the glider model by the glider type igc header
        text_fragments = ['%{}%'.format(v)
                          for v in re.sub(r'[^a-z]', ' ', glider_type).split()]
        digit_fragments = ['%{}%'.format(v)
                           for v in re.sub(r'[^0-9]', ' ', glider_type).split()]

        if not text_fragments and not digit_fragments:
            return None

        glider_type_clean = re.sub(r'[^a-z0-9]', '', glider_type)

        result = DBSession \
            .query(AircraftModel) \
            .filter(and_(
                func.regexp_replace(func.lower(AircraftModel.name),
                                    '[^a-z]', ' ').like(func.any(text_fragments)),
                func.regexp_replace(func.lower(AircraftModel.name),
                                    '[^0-9]', ' ').like(func.all(digit_fragments)))) \
            .order_by(func.levenshtein(
                func.regexp_replace(func.lower(AircraftModel.name),
                                    '[^a-z0-9]', ''),
                glider_type_clean))

        if result.first():
            return result.first().id

    # nothing found
    return None
def project_detail(project, request):
    project_name = request.matchdict["project_name"]

    if project_name != project.normalized_name:
        raise HTTPMovedPermanently(
            request.current_route_path(project_name=project.normalized_name)
        )

    releases = (
        request.db.query(Release)
        .filter(Release.project == project)
        .order_by(Release._pypi_ordering.desc())
        .limit(10)
        .all()
    )

    maintainers = [
        role
        for role in (
            request.db.query(Role)
            .join(User)
            .filter(Role.project == project)
            .distinct(User.username)
            .all()
        )
    ]
    maintainers = sorted(maintainers, key=lambda x: (x.role_name, x.user.username))

    journal = [
        entry
        for entry in (
            request.db.query(JournalEntry)
            .options(joinedload("submitted_by"))
            .filter(JournalEntry.name == project.name)
            .order_by(JournalEntry.submitted_date.desc(), JournalEntry.id.desc())
            .limit(30)
        )
    ]

    squattees = (
        request.db.query(Project)
        .filter(Project.created < project.created)
        .filter(func.levenshtein(Project.normalized_name, project.normalized_name) <= 2)
        .all()
    )

    squatters = (
        request.db.query(Project)
        .filter(Project.created > project.created)
        .filter(func.levenshtein(Project.normalized_name, project.normalized_name) <= 2)
        .all()
    )

    return {
        "project": project,
        "releases": releases,
        "maintainers": maintainers,
        "journal": journal,
        "squatters": squatters,
        "squattees": squattees,
        "ONE_MB": ONE_MB,
        "MAX_FILESIZE": MAX_FILESIZE,
    }
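# The two levenshtein() filters above flag likely typosquats: projects whose
# normalized name is within edit distance 2 of this one, split into projects
# older ("squattees") and newer ("squatters") than the current project. To
# inspect the SQL such a filter emits, inside the view above (illustrative;
# "requests" is an arbitrary probe name):
from sqlalchemy.dialects import postgresql

stmt = (
    request.db.query(Project)
    .filter(func.levenshtein(Project.normalized_name, "requests") <= 2)
    .statement
)
print(stmt.compile(dialect=postgresql.dialect()))
# -> roughly: SELECT ... WHERE levenshtein(projects.normalized_name, %(...)s) <= %(...)s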
def get_city(request):
    term = request.params['term']
    term_ascii = unicodedata.normalize('NFKD', unicode(term)) \
        .encode('ascii', 'ignore').lower()
    results = DBSession.query(*City.az_columns) \
        .filter(AdminZone.name % term_ascii) \
        .filter(AdminZone.admin_level == ADMIN_LEVEL_CITY) \
        .order_by(func.levenshtein(func.lower(AdminZone.name), term_ascii),
                  AdminZone.population.desc()) \
        .limit(10).all()
    return {'results': [City.format_city_res(res) for res in results]}
def levenshtein(conn, worda, wordb):
    """Return the Levenshtein distance between worda and wordb.

    Levenshtein distance measures the number of edits needed to get from
    one string to another. Since we're going to the database anyway, and
    since we're counting on this function in the database for sorting, we
    might as well use that function to provide the result.
    """
    lev = func.levenshtein(worda, wordb)
    s = select([lev])
    result = conn.execute(s)
    return [row[0] for row in result]
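# Hypothetical usage of the helper above, reusing the engine from the first
# sketch (assumes PostgreSQL with fuzzystrmatch installed):
with engine.connect() as conn:
    print(levenshtein(conn, "receive", "recieve"))  # -> [2]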
def find_word(conn, word, limit_to=10):
    """Find the word in the dictionary and find related words if it is not found.

    This involves a call to the database searching for words with common
    n-grams and then sorting the results by closeness to the original word
    (the levenshtein distance).
    """
    word_grams = get_ngrams.get_ngrams(word)
    lev = func.levenshtein(words.c.spelling, word)
    s = select([words, lev]) \
        .where(and_(ngrams.c.word_id == words.c.id,
                    ngrams.c.ngram.in_(word_grams))) \
        .distinct().limit(limit_to).order_by(lev)
    result = conn.execute(s)
    return [(row[1], row[2]) for row in result]
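# The n-gram filter above narrows the candidate set cheaply before the more
# expensive levenshtein() sort. A hypothetical stand-in for the
# get_ngrams.get_ngrams helper used there (the real module isn't shown in
# this snippet):
def get_ngrams(word, n=3):
    word = word.lower()
    if len(word) < n:
        return {word}
    return {word[i:i + n] for i in range(len(word) - n + 1)}

# get_ngrams("spelling") -> {'spe', 'pel', 'ell', 'lli', 'lin', 'ing'}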
def add_or_create_by_transaction(cls, transaction):
    recurring_transaction = RecurringTransaction.query.join(
        Transaction
    ).filter(
        RecurringTransaction.user_id == transaction.user_id,
        func.levenshtein(Transaction.description, transaction.description) < 10,  # nopep8
        Transaction.amount == transaction.amount,
    ).first()

    if not recurring_transaction:
        recurring_transaction = RecurringTransaction(
            user_id=transaction.user_id
        )

    recurring_transaction.transactions.append(transaction)
    recurring_transaction.save()
    return recurring_transaction
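# The levenshtein(...) < 10 filter above treats descriptions that differ
# only slightly (a date stamp, a reference suffix) as the same recurring
# charge. For example, using the pure-Python levenshtein_py sketched earlier:
assert levenshtein_py("NETFLIX.COM 01/03", "NETFLIX.COM 02/04") == 2  # well under 10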
def keywords_to_query_list(subject, keywords, fallback=False):
    # note: relies on `self` from the enclosing scope
    res = []
    for kw in keywords:
        if fallback:
            if not self.__oi.fuzzy:
                res.append(or_(
                    func.levenshtein(func.substring(subject, 0, 50),
                                     func.substring(kw, 0, 50)) < 3,
                    subject.like("%" + kw.replace("%", r"\%") + "%")
                ))
            else:
                res.append(subject.like("%" + kw.replace("%", r"\%") + "%"))
        else:
            res.append(subject == kw)
    return res
def postgres_lookup(input: str) -> List[Whatis]:
    subquery_base = db_session.query(
        Whatis.whatis_id, func.max(Whatis.version).label("version")
    )
    subquery_filtered = subquery_base.filter(
        or_(
            func.levenshtein(func.lower(Whatis.terminology), input.lower()) <= 1,
            Whatis.terminology.ilike(f"%{input}%") if len(input) > 3 else false(),
        )
    )
    subquery_grouped = subquery_filtered.group_by(Whatis.whatis_id).subquery("s2")
    query = db_session.query(Whatis).join(
        subquery_grouped,
        and_(
            Whatis.whatis_id == subquery_grouped.c.whatis_id,
            Whatis.version == subquery_grouped.c.version,
        ),
    )
    return query.all()
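# Illustrative call, assuming the session and model above are importable.
# Near-matches (edit distance <= 1, case-insensitive) come back along with
# substring hits for inputs longer than three characters:
for whatis in postgres_lookup("levenshtien"):
    print(whatis.whatis_id, whatis.version)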
def query(query_args):
    q = query_args.get('q', '')
    limit = int(query_args.get('limit', 10))
    offset = int(query_args.get('offset', 0))
    sumlevel = query_args.get('sumlevel', None)
    excluded = ['050AF0017065246', '050AF0015265268']
    focus_countries = [
        "040AF00155", "040AF00094", "040AF00253", "040AF00170",
        "040AF00217", "040AF00042", "040AF00152", "040AF00270",
        "040AF00257", "040AF00079", "040AF00205", "040AF00182",
        "040AF00133"
    ]
    adm0s = [int(country[-3:]) for country in focus_countries]
    qry = Geo.query.filter(Geo.name.ilike("%{}%".format(q)))
    cond = or_(Geo.id.in_(focus_countries), Geo.adm0_id.in_(adm0s))
    qry = qry.filter(cond)
    qry = qry.filter(~Geo.id.in_(excluded))
    if sumlevel:
        qry = qry.filter(Geo.level == sumlevel)
    qry = qry.order_by(Geo.level, func.levenshtein(Geo.name, q))
    qry = qry.limit(limit).offset(offset)
    return qry.all()
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = (normalize(text) or '')[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    min_l = func.greatest(1.0,
                          func.least(len(match_text),
                                     func.length(text_field)))
    score = func.greatest(
        0.0,
        ((min_l - func.levenshtein(text_field, match_text)) / min_l) * 100.0)
    score = func.max(score).label('score')

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label('id')

    # apply filters
    filters = [
        entities.c.dataset_id == dataset.id,
        entities.c.invalid == False  # noqa
    ]
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike('%%%s%%' % filter))

    q = select([id_, score], and_(*filters), [entities],
               group_by=[id_], order_by=[score.desc()])
    return Matches(q)
def find_matches(project, account, text, schemata=[], properties=[]):
    main = aliased(Property)
    ent = aliased(Entity)
    q = db.session.query(main.entity_id)
    q = q.filter(main.name == "name")
    q = q.filter(main.entity_id == ent.id)
    q = q.join(ent)
    q = q.filter(ent.project_id == project.id)

    for schema in schemata:
        obj = aliased(Schema)
        q = q.join(obj, ent.schema_id == obj.id)
        q = q.filter(obj.name == schema)

    for name, value in properties:
        p = aliased(Property)
        q = q.join(p, p.entity_id == ent.id)
        q = q.filter(p.active == True)  # noqa
        q = q.filter(p.name == name)
        attr = project.get_attribute("entity", name)
        column = getattr(p, attr.value_column)
        q = q.filter(column == value)

    # prepare text fields (todo: further normalization!)
    text_field = func.left(func.lower(main.value_string), 254)
    match_text = text.lower().strip()[:254]
    match_text_db = cast(match_text, types.Unicode)

    # calculate the difference percentage
    l = func.greatest(1.0,
                      func.least(len(match_text), func.length(text_field)))
    score = func.greatest(0.0,
                          ((l - func.levenshtein(text_field, match_text_db)) / l) * 100.0)
    score = score.label("score")

    q = q.add_columns(score)
    q = q.order_by(score.desc())
    q = q.filter(score > 50)
    return Matches(q, account)
def find_matches(project, account, text, schemata=[], properties=[]):
    main = aliased(Property)
    ent = aliased(Entity)
    q = db.session.query(main.entity_id)
    q = q.filter(main.name == 'name')
    q = q.filter(main.entity_id == ent.id)
    q = q.join(ent)
    q = q.filter(ent.project_id == project.id)

    if len(schemata):
        obj = aliased(Schema)
        q = q.join(obj, ent.schema_id == obj.id)
        q = q.filter(obj.name.in_(schemata))

    for name, value in properties:
        p = aliased(Property)
        q = q.join(p, p.entity_id == ent.id)
        q = q.filter(p.active == True)  # noqa
        q = q.filter(p.name == name)
        column = getattr(p, p.type_column(value))
        q = q.filter(column == value)

    # prepare text fields (todo: further normalization!)
    text_field = func.left(func.lower(main.value_string), 254)
    match_text = text.lower().strip()[:254]
    match_text_db = cast(match_text, types.Unicode)

    # calculate the difference percentage
    l = func.greatest(1.0,
                      func.least(len(match_text), func.length(text_field)))
    score = func.greatest(0.0,
                          ((l - func.levenshtein(text_field, match_text_db)) / l) * 100.0)
    score = score.label('score')

    q = q.group_by(main.entity_id)
    q = q.add_columns(func.max(score))
    q = q.order_by(func.max(score).desc())
    q = q.filter(score > 50)
    return Matches(q, project, account)
def recommendations(manga_name):
    sametype = request.args.get('sametype')
    samegenre = request.args.get('samegenre')
    commonrecs = request.args.get('commonrecs')
    manga_name = manga_name.replace('_', ' ')
    session = Session()

    users = session.query(Manga.name, Manga.recommender) \
        .filter(func.lower(Manga.name) == manga_name.lower()).all()

    if len(users) == 0:
        # fall back to the closest title by weighted edit distance:
        # levenshtein(source, target, ins_cost, del_cost, sub_cost)
        distance = func.levenshtein(Manga.name, manga_name, 2, 1, 4)
        first_manga = session.query(Manga.name, distance) \
            .filter(distance < 15) \
            .order_by(asc(distance)) \
            .first()
        if first_manga:
            users = session.query(Manga.name, Manga.recommender) \
                .filter(Manga.name == first_manga.name).all()
            manga_name = users[0].name
        else:
            users = []
    else:
        manga_name = users[0].name

    users = [item.recommender for item in users]

    common_manga = session.query(
        Manga.name, Manga.mu_id, Manga.type, Manga.demographic,
        func.count(Manga.name)) \
        .filter(Manga.recommender.in_(users),
                func.lower(Manga.name) != manga_name.lower()) \
        .group_by(Manga.name, Manga.mu_id, Manga.type, Manga.demographic) \
        .having(func.count(Manga.name) > 1) \
        .order_by(func.random()).all()

    long_manga = session.query(
        Manga.name, Manga.mu_id, Manga.type, Manga.demographic,
        func.count(Manga.name)) \
        .filter(and_(Manga.recommender.in_(users),
                     func.lower(Manga.name) != manga_name.lower())) \
        .group_by(Manga.name, Manga.mu_id, Manga.type, Manga.demographic) \
        .having(func.count(Manga.name) == 1) \
        .order_by(func.random()).all()

    manga_details = session.query(Manga.type, Manga.demographic) \
        .filter(Manga.name == manga_name).first()
    session.close()

    type = manga_details[0]
    demographic = manga_details[1]
    checked = [sametype, samegenre, commonrecs]

    recs = []
    for item in common_manga:
        recs.append([item.name, item.type, item.demographic, item.mu_id, "common"])
    for item in long_manga:
        recs.append([item.name, item.type, item.demographic, item.mu_id, "long"])

    return render_template('recommendations.html', manga_name=manga_name,
                           recs=recs, checked=checked, type=type,
                           demographic=demographic)
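# The three trailing arguments above use fuzzystrmatch's weighted variant,
#   levenshtein(source, target, ins_cost, del_cost, sub_cost),
# pricing insertions at 2, deletions at 1, and substitutions at 4 when
# ranking fallback titles. (With these weights a delete+insert pair costs
# 3 and undercuts a substitution; the function always takes the cheapest
# edit script.) A sketch of the expression in isolation, assuming a live
# Session() as in the view above; "Berserk" is an arbitrary probe string:
weighted = func.levenshtein(Manga.name, "Berserk", 2, 1, 4)
closest = session.query(Manga.name, weighted).order_by(asc(weighted)).first()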
def file_upload(request):
    # If we're in read-only mode, let upload clients know
    if request.flags.enabled("read-only"):
        raise _exc_with_message(
            HTTPForbidden, "Read-only mode: Uploads are temporarily disabled"
        )

    # Log an attempt to upload
    metrics = request.find_service(IMetricsService, context=None)
    metrics.increment("warehouse.upload.attempt")

    # Before we do anything, if there isn't an authenticated user with this
    # request, then we'll go ahead and bomb out.
    if request.authenticated_userid is None:
        raise _exc_with_message(
            HTTPForbidden, "Invalid or non-existent authentication information."
        )

    # Ensure that user has a verified, primary email address. This should both
    # reduce the ease of spam account creation and activity, as well as act as
    # a forcing function for https://github.com/pypa/warehouse/issues/3632.
    # TODO: Once https://github.com/pypa/warehouse/issues/3632 has been solved,
    #       we might consider a different condition, possibly looking at
    #       User.is_active instead.
    if not (request.user.primary_email and request.user.primary_email.verified):
        raise _exc_with_message(
            HTTPBadRequest,
            (
                "User {!r} does not have a verified primary email address. "
                "Please add a verified primary email before attempting to "
                "upload to PyPI. See {project_help} for more information."
            ).format(
                request.user.username,
                project_help=request.help_url(_anchor="verified-email"),
            ),
        ) from None

    # Do some cleanup of the various form fields
    for key in list(request.POST):
        value = request.POST.get(key)
        if isinstance(value, str):
            # distutils "helpfully" substitutes unknown, but "required" values
            # with the string "UNKNOWN". This is basically never what anyone
            # actually wants so we'll just go ahead and delete anything whose
            # value is UNKNOWN.
            if value.strip() == "UNKNOWN":
                del request.POST[key]

            # Escape NUL characters, which psycopg doesn't like
            if "\x00" in value:
                request.POST[key] = value.replace("\x00", "\\x00")

    # We require protocol_version 1; it's the only supported version, but
    # passing a different version should raise an error.
    if request.POST.get("protocol_version", "1") != "1":
        raise _exc_with_message(HTTPBadRequest, "Unknown protocol version.")

    # Check if any fields were supplied as a tuple and have become a
    # FieldStorage. The 'content' and 'gpg_signature' fields _should_ be a
    # FieldStorage, however.
    # ref: https://github.com/pypa/warehouse/issues/2185
    # ref: https://github.com/pypa/warehouse/issues/2491
    for field in set(request.POST) - {"content", "gpg_signature"}:
        values = request.POST.getall(field)
        if any(isinstance(value, FieldStorage) for value in values):
            raise _exc_with_message(HTTPBadRequest, f"{field}: Should not be a tuple.")

    # Look up all of the valid classifiers
    all_classifiers = request.db.query(Classifier).all()

    # Validate and process the incoming metadata.
    form = MetadataForm(request.POST)

    # Add a validator for deprecated classifiers
    form.classifiers.validators.append(_no_deprecated_classifiers(request))

    form.classifiers.choices = [(c.classifier, c.classifier) for c in all_classifiers]
    if not form.validate():
        for field_name in _error_message_order:
            if field_name in form.errors:
                break
        else:
            field_name = sorted(form.errors.keys())[0]

        if field_name in form:
            field = form[field_name]
            if field.description and isinstance(field, wtforms.StringField):
                error_message = (
                    "{value!r} is an invalid value for {field}. ".format(
                        value=field.data, field=field.description
                    )
                    + "Error: {} ".format(form.errors[field_name][0])
                    + "See "
                    "https://packaging.python.org/specifications/core-metadata"
                )
            else:
                error_message = "Invalid value for {field}. Error: {msgs[0]}".format(
                    field=field_name, msgs=form.errors[field_name]
                )
        else:
            error_message = "Error: {}".format(form.errors[field_name][0])

        raise _exc_with_message(HTTPBadRequest, error_message)

    # Ensure that we have file data in the request.
    if "content" not in request.POST:
        raise _exc_with_message(HTTPBadRequest, "Upload payload does not have a file.")

    # Look up the project first before doing anything else, this is so we can
    # automatically register it if we need to and can check permissions before
    # going any further.
    try:
        project = (
            request.db.query(Project)
            .filter(
                Project.normalized_name == func.normalize_pep426_name(form.name.data)
            )
            .one()
        )
    except NoResultFound:
        # Check for AdminFlag set by a PyPI Administrator disabling new project
        # registration, reasons for this include Spammers, security
        # vulnerabilities, or just wanting to be lazy and not worry ;)
        if request.flags.enabled("disallow-new-project-registration"):
            raise _exc_with_message(
                HTTPForbidden,
                (
                    "New project registration temporarily disabled. "
                    "See {projecthelp} for details"
                ).format(projecthelp=request.help_url(_anchor="admin-intervention")),
            ) from None

        # Before we create the project, we're going to check our blacklist to
        # see if this project is even allowed to be registered. If it is not,
        # then we're going to deny the request to create this project.
        if request.db.query(
            exists().where(
                BlacklistedProject.name == func.normalize_pep426_name(form.name.data)
            )
        ).scalar():
            raise _exc_with_message(
                HTTPBadRequest,
                (
                    "The name {name!r} isn't allowed. "
                    "See {projecthelp} "
                    "for more information."
                ).format(
                    name=form.name.data,
                    projecthelp=request.help_url(_anchor="project-name"),
                ),
            ) from None

        # Also check for collisions with Python Standard Library modules.
        if packaging.utils.canonicalize_name(form.name.data) in STDLIB_PROHIBITTED:
            raise _exc_with_message(
                HTTPBadRequest,
                (
                    "The name {name!r} isn't allowed (conflict with Python "
                    "Standard Library module name). See "
                    "{projecthelp} for more information."
                ).format(
                    name=form.name.data,
                    projecthelp=request.help_url(_anchor="project-name"),
                ),
            ) from None

        # The project doesn't exist in our database, so first we'll check for
        # projects with a similar name
        squattees = (
            request.db.query(Project)
            .filter(
                func.levenshtein(
                    Project.normalized_name, func.normalize_pep426_name(form.name.data)
                )
                <= 2
            )
            .all()
        )

        # Next we'll create the project
        project = Project(name=form.name.data)
        request.db.add(project)

        # Now that the project exists, add any squats which it is the squatter for
        for squattee in squattees:
            request.db.add(Squat(squatter=project, squattee=squattee))

        # Then we'll add a role setting the current user as the "Owner" of the
        # project.
        request.db.add(Role(user=request.user, project=project, role_name="Owner"))
        # TODO: This should be handled by some sort of database trigger or a
        #       SQLAlchemy hook or the like instead of doing it inline in this
        #       view.
        request.db.add(
            JournalEntry(
                name=project.name,
                action="create",
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )
        request.db.add(
            JournalEntry(
                name=project.name,
                action="add Owner {}".format(request.user.username),
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )

    # Check that the user has permission to do things to this project, if this
    # is a new project this will act as a sanity check for the role we just
    # added above.
    if not request.has_permission("upload", project):
        raise _exc_with_message(
            HTTPForbidden,
            (
                "The credential associated with user '{0}' "
                "isn't allowed to upload to project '{1}'. "
                "See {2} for more information."
            ).format(
                request.user.username,
                project.name,
                request.help_url(_anchor="project-name"),
            ),
        )

    # Update name if it differs but is still equivalent. We don't need to check if
    # they are equivalent when normalized because that's already been done when we
    # queried for the project.
    if project.name != form.name.data:
        project.name = form.name.data

    # Render our description so we can save from having to render this data every time
    # we load a project description page.
    rendered = None
    if form.description.data:
        description_content_type = form.description_content_type.data
        if not description_content_type:
            description_content_type = "text/x-rst"

        rendered = readme.render(
            form.description.data, description_content_type, use_fallback=False
        )

        # Uploading should prevent broken rendered descriptions.
        if rendered is None:
            if form.description_content_type.data:
                message = (
                    "The description failed to render "
                    "for '{description_content_type}'."
                ).format(description_content_type=description_content_type)
            else:
                message = (
                    "The description failed to render "
                    "in the default format of reStructuredText."
                )
            raise _exc_with_message(
                HTTPBadRequest,
                "{message} See {projecthelp} for more information.".format(
                    message=message,
                    projecthelp=request.help_url(_anchor="description-content-type"),
                ),
            ) from None

    try:
        canonical_version = packaging.utils.canonicalize_version(form.version.data)
        release = (
            request.db.query(Release)
            .filter(
                (Release.project == project)
                & (Release.canonical_version == canonical_version)
            )
            .one()
        )
    except MultipleResultsFound:
        # There are multiple releases of this project which have the same
        # canonical version that were uploaded before we checked for
        # canonical version equivalence, so return the exact match instead
        release = (
            request.db.query(Release)
            .filter(
                (Release.project == project) & (Release.version == form.version.data)
            )
            .one()
        )
    except NoResultFound:
        release = Release(
            project=project,
            _classifiers=[
                c for c in all_classifiers if c.classifier in form.classifiers.data
            ],
            dependencies=list(
                _construct_dependencies(
                    form,
                    {
                        "requires": DependencyKind.requires,
                        "provides": DependencyKind.provides,
                        "obsoletes": DependencyKind.obsoletes,
                        "requires_dist": DependencyKind.requires_dist,
                        "provides_dist": DependencyKind.provides_dist,
                        "obsoletes_dist": DependencyKind.obsoletes_dist,
                        "requires_external": DependencyKind.requires_external,
                        "project_urls": DependencyKind.project_url,
                    },
                )
            ),
            canonical_version=canonical_version,
            description=Description(
                content_type=form.description_content_type.data,
                raw=form.description.data or "",
                html=rendered or "",
                rendered_by=readme.renderer_version(),
            ),
            **{
                k: getattr(form, k).data
                for k in {
                    # This is a list of all the fields in the form that we
                    # should pull off and insert into our new release.
                    "version",
                    "summary",
                    "license",
                    "author",
                    "author_email",
                    "maintainer",
                    "maintainer_email",
                    "keywords",
                    "platform",
                    "home_page",
                    "download_url",
                    "requires_python",
                }
            },
            uploader=request.user,
            uploaded_via=request.user_agent,
        )
        request.db.add(release)
        # TODO: This should be handled by some sort of database trigger or
        #       a SQLAlchemy hook or the like instead of doing it inline in
        #       this view.
        request.db.add(
            JournalEntry(
                name=release.project.name,
                version=release.version,
                action="new release",
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )

    # TODO: We need a better solution to this than to just do it inline inside
    #       this method. Ideally the version field would just be sortable, but
    #       at least this should be some sort of hook or trigger.
    releases = (
        request.db.query(Release)
        .filter(Release.project == project)
        .options(orm.load_only(Release._pypi_ordering))
        .all()
    )
    for i, r in enumerate(
        sorted(releases, key=lambda x: packaging.version.parse(x.version))
    ):
        r._pypi_ordering = i

    # Pull the filename out of our POST data.
    filename = request.POST["content"].filename

    # Make sure that the filename does not contain any path separators.
    if "/" in filename or "\\" in filename:
        raise _exc_with_message(
            HTTPBadRequest, "Cannot upload a file with '/' or '\\' in the name."
        )

    # Make sure the filename ends with an allowed extension.
    if _dist_file_regexes[project.allow_legacy_files].search(filename) is None:
        raise _exc_with_message(
            HTTPBadRequest,
            "Invalid file extension: Use .egg, .tar.gz, .whl or .zip "
            "extension. (https://www.python.org/dev/peps/pep-0527)",
        )

    # Make sure that our filename matches the project that it is being uploaded
    # to.
    prefix = pkg_resources.safe_name(project.name).lower()
    if not pkg_resources.safe_name(filename).lower().startswith(prefix):
        raise _exc_with_message(
            HTTPBadRequest,
            "Start filename for {!r} with {!r}.".format(project.name, prefix),
        )

    # Check the content type of what is being uploaded
    if not request.POST["content"].type or request.POST["content"].type.startswith(
        "image/"
    ):
        raise _exc_with_message(HTTPBadRequest, "Invalid distribution file.")

    # Ensure that the package filetype is allowed.
    # TODO: Once PEP 527 is completely implemented we should be able to delete
    #       this and just move it into the form itself.
    if not project.allow_legacy_files and form.filetype.data not in {
        "sdist",
        "bdist_wheel",
        "bdist_egg",
    }:
        raise _exc_with_message(HTTPBadRequest, "Unknown type of file.")

    # The project may or may not have a file size specified on the project, if
    # it does then it may or may not be smaller or larger than our global file
    # size limits.
    file_size_limit = max(filter(None, [MAX_FILESIZE, project.upload_limit]))

    with tempfile.TemporaryDirectory() as tmpdir:
        temporary_filename = os.path.join(tmpdir, filename)

        # Buffer the entire file onto disk, checking the hash of the file as we
        # go along.
        with open(temporary_filename, "wb") as fp:
            file_size = 0
            file_hashes = {
                "md5": hashlib.md5(),
                "sha256": hashlib.sha256(),
                "blake2_256": hashlib.blake2b(digest_size=256 // 8),
            }
            for chunk in iter(lambda: request.POST["content"].file.read(8096), b""):
                file_size += len(chunk)
                if file_size > file_size_limit:
                    raise _exc_with_message(
                        HTTPBadRequest,
                        "File too large. "
                        + "Limit for project {name!r} is {limit} MB. ".format(
                            name=project.name, limit=file_size_limit // (1024 * 1024)
                        )
                        + "See "
                        + request.help_url(_anchor="file-size-limit"),
                    )
                fp.write(chunk)
                for hasher in file_hashes.values():
                    hasher.update(chunk)

        # Take our hash functions and compute the final hashes for them now.
        file_hashes = {k: h.hexdigest().lower() for k, h in file_hashes.items()}

        # Actually verify the digests that we've gotten. We're going to use
        # hmac.compare_digest even though we probably don't actually need to
        # because it's better safe than sorry. In the case of multiple digests
        # we expect them all to be given.
        if not all(
            [
                hmac.compare_digest(
                    getattr(form, "{}_digest".format(digest_name)).data.lower(),
                    digest_value,
                )
                for digest_name, digest_value in file_hashes.items()
                if getattr(form, "{}_digest".format(digest_name)).data
            ]
        ):
            raise _exc_with_message(
                HTTPBadRequest,
                "The digest supplied does not match a digest calculated "
                "from the uploaded file.",
            )

        # Check to see if the file that was uploaded exists already or not.
        is_duplicate = _is_duplicate_file(request.db, filename, file_hashes)
        if is_duplicate:
            return Response()
        elif is_duplicate is not None:
            raise _exc_with_message(
                HTTPBadRequest,
                # Note: Changing this error message to something that doesn't
                # start with "File already exists" will break the
                # --skip-existing functionality in twine
                # ref: https://github.com/pypa/warehouse/issues/3482
                # ref: https://github.com/pypa/twine/issues/332
                "File already exists. See "
                + request.help_url(_anchor="file-name-reuse"),
            )

        # Check to see if the file that was uploaded exists in our filename log
        if request.db.query(
            request.db.query(Filename).filter(Filename.filename == filename).exists()
        ).scalar():
            raise _exc_with_message(
                HTTPBadRequest,
                "This filename has already been used, use a "
                "different version. "
                "See " + request.help_url(_anchor="file-name-reuse"),
            )

        # Check to see if uploading this file would create a duplicate sdist
        # for the current release.
        if (
            form.filetype.data == "sdist"
            and request.db.query(
                request.db.query(File)
                .filter((File.release == release) & (File.packagetype == "sdist"))
                .exists()
            ).scalar()
        ):
            raise _exc_with_message(
                HTTPBadRequest, "Only one sdist may be uploaded per release."
            )

        # Check the file to make sure it is a valid distribution file.
        if not _is_valid_dist_file(temporary_filename, form.filetype.data):
            raise _exc_with_message(HTTPBadRequest, "Invalid distribution file.")

        # Check that if it's a binary wheel, it's on a supported platform
        if filename.endswith(".whl"):
            wheel_info = _wheel_file_re.match(filename)
            plats = wheel_info.group("plat").split(".")
            for plat in plats:
                if not _valid_platform_tag(plat):
                    raise _exc_with_message(
                        HTTPBadRequest,
                        "Binary wheel '{filename}' has an unsupported "
                        "platform tag '{plat}'.".format(filename=filename, plat=plat),
                    )

        # Also buffer the entire signature file to disk.
        if "gpg_signature" in request.POST:
            has_signature = True
            with open(os.path.join(tmpdir, filename + ".asc"), "wb") as fp:
                signature_size = 0
                for chunk in iter(
                    lambda: request.POST["gpg_signature"].file.read(8096), b""
                ):
                    signature_size += len(chunk)
                    if signature_size > MAX_SIGSIZE:
                        raise _exc_with_message(HTTPBadRequest, "Signature too large.")
                    fp.write(chunk)

            # Check whether signature is ASCII armored
            with open(os.path.join(tmpdir, filename + ".asc"), "rb") as fp:
                if not fp.read().startswith(b"-----BEGIN PGP SIGNATURE-----"):
                    raise _exc_with_message(
                        HTTPBadRequest, "PGP signature isn't ASCII armored."
                    )
        else:
            has_signature = False

        # TODO: This should be handled by some sort of database trigger or a
        #       SQLAlchemy hook or the like instead of doing it inline in this
        #       view.
        request.db.add(Filename(filename=filename))

        # Store the information about the file in the database.
        file_ = File(
            release=release,
            filename=filename,
            python_version=form.pyversion.data,
            packagetype=form.filetype.data,
            comment_text=form.comment.data,
            size=file_size,
            has_signature=bool(has_signature),
            md5_digest=file_hashes["md5"],
            sha256_digest=file_hashes["sha256"],
            blake2_256_digest=file_hashes["blake2_256"],
            # Figure out what our filepath is going to be, we're going to use a
            # directory structure based on the hash of the file contents. This
            # will ensure that the contents of the file cannot change without
            # it also changing the path that the file is saved to.
            path="/".join(
                [
                    file_hashes[PATH_HASHER][:2],
                    file_hashes[PATH_HASHER][2:4],
                    file_hashes[PATH_HASHER][4:],
                    filename,
                ]
            ),
            uploaded_via=request.user_agent,
        )
        request.db.add(file_)

        # TODO: This should be handled by some sort of database trigger or a
        #       SQLAlchemy hook or the like instead of doing it inline in this
        #       view.
        request.db.add(
            JournalEntry(
                name=release.project.name,
                version=release.version,
                action="add {python_version} file {filename}".format(
                    python_version=file_.python_version, filename=file_.filename
                ),
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )

        # TODO: We need a better answer about how to make this transactional so
        #       this won't take effect until after a commit has happened, for
        #       now we'll just ignore it and save it before the transaction is
        #       committed.
        storage = request.find_service(IFileStorage)
        storage.store(
            file_.path,
            os.path.join(tmpdir, filename),
            meta={
                "project": file_.release.project.normalized_name,
                "version": file_.release.version,
                "package-type": file_.packagetype,
                "python-version": file_.python_version,
            },
        )
        if has_signature:
            storage.store(
                file_.pgp_path,
                os.path.join(tmpdir, filename + ".asc"),
                meta={
                    "project": file_.release.project.normalized_name,
                    "version": file_.release.version,
                    "package-type": file_.packagetype,
                    "python-version": file_.python_version,
                },
            )

    # Log a successful upload
    metrics.increment("warehouse.upload.ok", tags=[f"filetype:{form.filetype.data}"])

    return Response()
def serve(self):
    # Configure database for the index
    Base.metadata.create_all(self.env.getDatabaseEngine("backend-database"))

    # Store DB session
    self.__session = self.env.getDatabaseSession("backend-database")

    # Do a feature check
    try:
        self.__session.query(KeyValueIndex) \
            .filter(func.levenshtein("foo", "foo") < 2).one_or_none()
        self.fuzzy = True
    except:
        self.__session.rollback()

    # If there is already a collection, check if there is a newer schema available
    schema = self.factory.getXMLObjectSchema(True)
    if self.isSchemaUpdated(schema):
        self.__session.query(Schema).delete()
        self.__session.query(KeyValueIndex).delete()
        self.__session.query(ExtensionIndex).delete()
        self.__session.query(ObjectInfoIndex).delete()
        self.log.info('object definitions changed, dropped old object index')

    # Create the initial schema information if required
    if not self.__session.query(Schema).one_or_none():
        self.log.info('created schema')
        md5s = hashlib.md5()
        md5s.update(schema)
        md5sum = md5s.hexdigest()

        schema = Schema(hash=md5sum)
        self.__session.add(schema)
        self.__session.commit()

    # Schedule index sync
    if self.env.config.get("backend.index", "True").lower() == "true":
        import sys
        if hasattr(sys, '_called_from_test'):
            self.sync_index()
        else:
            sobj = PluginRegistry.getInstance("SchedulerService")
            sobj.getScheduler().add_date_job(
                self.sync_index,
                datetime.datetime.now() + datetime.timedelta(seconds=1),
                tag='_internal', jobstore='ram')

    # Extract search aid
    attrs = {}
    mapping = {}
    resolve = {}
    aliases = {}

    for otype in self.factory.getObjectTypes():

        # Assemble search aid
        item = self.factory.getObjectSearchAid(otype)

        if not item:
            continue

        typ = item['type']
        aliases[typ] = [typ]

        if typ not in attrs:
            attrs[typ] = []
        if typ not in resolve:
            resolve[typ] = []
        if typ not in mapping:
            mapping[typ] = dict(dn="dn", title="title",
                                description="description", icon=None)

        attrs[typ] += item['search']

        if 'keyword' in item:
            aliases[typ] += item['keyword']
        if 'map' in item:
            mapping[typ].update(item['map'])
        if 'resolve' in item:
            resolve[typ] += item['resolve']

    # Add index for attribute used for filtering and memorize
    # attributes potentially needed for queries.
    tmp = [x for x in attrs.values()]
    used_attrs = list(itertools.chain.from_iterable(tmp))
    used_attrs += list(itertools.chain.from_iterable(
        [x.values() for x in mapping.values()]))
    used_attrs += list(set(itertools.chain.from_iterable(
        [[x[0]['filter'], x[0]['attribute']]
         for x in resolve.values() if len(x)])))
    used_attrs = list(set(used_attrs))

    # Remove potentially not assigned values
    used_attrs = [u for u in used_attrs if u]

    # Memorize search information for later use
    self.__search_aid = dict(attrs=attrs,
                             used_attrs=used_attrs,
                             mapping=mapping,
                             resolve=resolve,
                             aliases=aliases)
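# The try/except above is a runtime capability probe: issue a trivial
# levenshtein() call and treat a failure as "fuzzystrmatch not installed".
# A minimal standalone sketch of the same idea, assuming a Session bound to
# PostgreSQL:
from sqlalchemy import func, select
from sqlalchemy.exc import OperationalError, ProgrammingError

def has_fuzzystrmatch(session):
    try:
        session.execute(select(func.levenshtein("foo", "foo")))
        return True
    except (ProgrammingError, OperationalError):
        # an undefined-function error means the extension is missing
        session.rollback()
        return False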
def check_phash(db, name):
    return db.query(BadPic).filter(
        func.levenshtein(
            BadPic.phash,
            hex2bin(str(imagehash.phash(Image.open(name))))
        ) < 10
    ).first()
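# Here levenshtein() over '0'/'1' strings approximates a Hamming distance
# between perceptual hashes. A hypothetical sketch of the hex2bin helper
# assumed above (not shown in the source):
def hex2bin(hex_str):
    # hex digest -> fixed-width bit string, e.g. hex2bin("f0") -> "11110000"
    return bin(int(hex_str, 16))[2:].zfill(len(hex_str) * 4)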
def file_upload(request):
    # If we're in read-only mode, let upload clients know
    if request.flags.enabled("read-only"):
        raise _exc_with_message(
            HTTPForbidden, "Read-only mode: Uploads are temporarily disabled"
        )

    # Before we do anything, if there isn't an authenticated user with this
    # request, then we'll go ahead and bomb out.
    if request.authenticated_userid is None:
        raise _exc_with_message(
            HTTPForbidden, "Invalid or non-existent authentication information."
        )

    # Ensure that user has a verified, primary email address. This should both
    # reduce the ease of spam account creation and activity, as well as act as
    # a forcing function for https://github.com/pypa/warehouse/issues/3632.
    # TODO: Once https://github.com/pypa/warehouse/issues/3632 has been solved,
    #       we might consider a different condition, possibly looking at
    #       User.is_active instead.
    if not (request.user.primary_email and request.user.primary_email.verified):
        raise _exc_with_message(
            HTTPBadRequest,
            (
                "User {!r} does not have a verified primary email address. "
                "Please add a verified primary email before attempting to "
                "upload to PyPI. See {project_help} for more information."
            ).format(
                request.user.username,
                project_help=request.help_url(_anchor="verified-email"),
            ),
        ) from None

    # Do some cleanup of the various form fields
    for key in list(request.POST):
        value = request.POST.get(key)
        if isinstance(value, str):
            # distutils "helpfully" substitutes unknown, but "required" values
            # with the string "UNKNOWN". This is basically never what anyone
            # actually wants so we'll just go ahead and delete anything whose
            # value is UNKNOWN.
            if value.strip() == "UNKNOWN":
                del request.POST[key]

            # Escape NUL characters, which psycopg doesn't like
            if "\x00" in value:
                request.POST[key] = value.replace("\x00", "\\x00")

    # We require protocol_version 1; it's the only supported version, but
    # passing a different version should raise an error.
    if request.POST.get("protocol_version", "1") != "1":
        raise _exc_with_message(HTTPBadRequest, "Unknown protocol version.")

    # Check if any fields were supplied as a tuple and have become a
    # FieldStorage. The 'content' and 'gpg_signature' fields _should_ be a
    # FieldStorage, however.
    # ref: https://github.com/pypa/warehouse/issues/2185
    # ref: https://github.com/pypa/warehouse/issues/2491
    for field in set(request.POST) - {"content", "gpg_signature"}:
        values = request.POST.getall(field)
        if any(isinstance(value, FieldStorage) for value in values):
            raise _exc_with_message(HTTPBadRequest, f"{field}: Should not be a tuple.")

    # Look up all of the valid classifiers
    all_classifiers = request.db.query(Classifier).all()

    # Validate and process the incoming metadata.
    form = MetadataForm(request.POST)

    # Add a validator for deprecated classifiers
    form.classifiers.validators.append(_no_deprecated_classifiers(request))

    form.classifiers.choices = [(c.classifier, c.classifier) for c in all_classifiers]
    if not form.validate():
        for field_name in _error_message_order:
            if field_name in form.errors:
                break
        else:
            field_name = sorted(form.errors.keys())[0]

        if field_name in form:
            field = form[field_name]
            if field.description and isinstance(field, wtforms.StringField):
                error_message = (
                    "{value!r} is an invalid value for {field}. ".format(
                        value=field.data, field=field.description
                    )
                    + "Error: {} ".format(form.errors[field_name][0])
                    + "See "
                    "https://packaging.python.org/specifications/core-metadata"
                )
            else:
                error_message = "Invalid value for {field}. Error: {msgs[0]}".format(
                    field=field_name, msgs=form.errors[field_name]
                )
        else:
            error_message = "Error: {}".format(form.errors[field_name][0])

        raise _exc_with_message(HTTPBadRequest, error_message)

    # Ensure that we have file data in the request.
    if "content" not in request.POST:
        raise _exc_with_message(HTTPBadRequest, "Upload payload does not have a file.")

    # Look up the project first before doing anything else, this is so we can
    # automatically register it if we need to and can check permissions before
    # going any further.
    try:
        project = (
            request.db.query(Project)
            .filter(
                Project.normalized_name == func.normalize_pep426_name(form.name.data)
            )
            .one()
        )
    except NoResultFound:
        # Check for AdminFlag set by a PyPI Administrator disabling new project
        # registration, reasons for this include Spammers, security
        # vulnerabilities, or just wanting to be lazy and not worry ;)
        if request.flags.enabled("disallow-new-project-registration"):
            raise _exc_with_message(
                HTTPForbidden,
                (
                    "New project registration temporarily disabled. "
                    "See {projecthelp} for details"
                ).format(projecthelp=request.help_url(_anchor="admin-intervention")),
            ) from None

        # Before we create the project, we're going to check our blacklist to
        # see if this project is even allowed to be registered. If it is not,
        # then we're going to deny the request to create this project.
        if request.db.query(
            exists().where(
                BlacklistedProject.name == func.normalize_pep426_name(form.name.data)
            )
        ).scalar():
            raise _exc_with_message(
                HTTPBadRequest,
                (
                    "The name {name!r} isn't allowed. "
                    "See {projecthelp} "
                    "for more information."
                ).format(
                    name=form.name.data,
                    projecthelp=request.help_url(_anchor="project-name"),
                ),
            ) from None

        # Also check for collisions with Python Standard Library modules.
        if packaging.utils.canonicalize_name(form.name.data) in STDLIB_PROHIBITTED:
            raise _exc_with_message(
                HTTPBadRequest,
                (
                    "The name {name!r} isn't allowed (conflict with Python "
                    "Standard Library module name). See "
                    "{projecthelp} for more information."
                ).format(
                    name=form.name.data,
                    projecthelp=request.help_url(_anchor="project-name"),
                ),
            ) from None

        # The project doesn't exist in our database, so first we'll check for
        # projects with a similar name
        squattees = (
            request.db.query(Project)
            .filter(
                func.levenshtein(
                    Project.normalized_name, func.normalize_pep426_name(form.name.data)
                )
                <= 2
            )
            .all()
        )

        # Next we'll create the project
        project = Project(name=form.name.data)
        request.db.add(project)

        # Now that the project exists, add any squats which it is the squatter for
        for squattee in squattees:
            request.db.add(Squat(squatter=project, squattee=squattee))

        # Then we'll add a role setting the current user as the "Owner" of the
        # project.
        request.db.add(Role(user=request.user, project=project, role_name="Owner"))
        # TODO: This should be handled by some sort of database trigger or a
        #       SQLAlchemy hook or the like instead of doing it inline in this
        #       view.
        request.db.add(
            JournalEntry(
                name=project.name,
                action="create",
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )
        request.db.add(
            JournalEntry(
                name=project.name,
                action="add Owner {}".format(request.user.username),
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )

    # Check that the user has permission to do things to this project, if this
    # is a new project this will act as a sanity check for the role we just
    # added above.
    if not request.has_permission("upload", project):
        raise _exc_with_message(
            HTTPForbidden,
            (
                "The user '{0}' isn't allowed to upload to project '{1}'. "
                "See {2} for more information."
            ).format(
                request.user.username,
                project.name,
                request.help_url(_anchor="project-name"),
            ),
        )

    # Uploading should prevent broken rendered descriptions.
    # Temporarily disabled, see
    # https://github.com/pypa/warehouse/issues/4079
    # if form.description.data:
    #     description_content_type = form.description_content_type.data
    #     if not description_content_type:
    #         description_content_type = "text/x-rst"
    #     rendered = readme.render(
    #         form.description.data, description_content_type, use_fallback=False
    #     )

    #     if rendered is None:
    #         if form.description_content_type.data:
    #             message = (
    #                 "The description failed to render "
    #                 "for '{description_content_type}'."
    #             ).format(description_content_type=description_content_type)
    #         else:
    #             message = (
    #                 "The description failed to render "
    #                 "in the default format of reStructuredText."
    #             )
    #         raise _exc_with_message(
    #             HTTPBadRequest,
    #             "{message} See {projecthelp} for more information.".format(
    #                 message=message,
    #                 projecthelp=request.help_url(_anchor="description-content-type"),
    #             ),
    #         ) from None

    try:
        canonical_version = packaging.utils.canonicalize_version(form.version.data)
        release = (
            request.db.query(Release)
            .filter(
                (Release.project == project)
                & (Release.canonical_version == canonical_version)
            )
            .one()
        )
    except MultipleResultsFound:
        # There are multiple releases of this project which have the same
        # canonical version that were uploaded before we checked for
        # canonical version equivalence, so return the exact match instead
        release = (
            request.db.query(Release)
            .filter(
                (Release.project == project) & (Release.version == form.version.data)
            )
            .one()
        )
    except NoResultFound:
        release = Release(
            project=project,
            _classifiers=[
                c for c in all_classifiers if c.classifier in form.classifiers.data
            ],
            dependencies=list(
                _construct_dependencies(
                    form,
                    {
                        "requires": DependencyKind.requires,
                        "provides": DependencyKind.provides,
                        "obsoletes": DependencyKind.obsoletes,
                        "requires_dist": DependencyKind.requires_dist,
                        "provides_dist": DependencyKind.provides_dist,
                        "obsoletes_dist": DependencyKind.obsoletes_dist,
                        "requires_external": DependencyKind.requires_external,
                        "project_urls": DependencyKind.project_url,
                    },
                )
            ),
            canonical_version=canonical_version,
            **{
                k: getattr(form, k).data
                for k in {
                    # This is a list of all the fields in the form that we
                    # should pull off and insert into our new release.
                    "version",
                    "summary",
                    "description",
                    "description_content_type",
                    "license",
                    "author",
                    "author_email",
                    "maintainer",
                    "maintainer_email",
                    "keywords",
                    "platform",
                    "home_page",
                    "download_url",
                    "requires_python",
                }
            },
            uploader=request.user,
            uploaded_via=request.user_agent,
        )
        request.db.add(release)
        # TODO: This should be handled by some sort of database trigger or
        #       a SQLAlchemy hook or the like instead of doing it inline in
        #       this view.
        request.db.add(
            JournalEntry(
                name=release.project.name,
                version=release.version,
                action="new release",
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )

    # TODO: We need a better solution to this than to just do it inline inside
    #       this method. Ideally the version field would just be sortable, but
    #       at least this should be some sort of hook or trigger.
    releases = (
        request.db.query(Release)
        .filter(Release.project == project)
        .options(orm.load_only(Release._pypi_ordering))
        .all()
    )
    for i, r in enumerate(
        sorted(releases, key=lambda x: packaging.version.parse(x.version))
    ):
        r._pypi_ordering = i

    # Pull the filename out of our POST data.
    filename = request.POST["content"].filename

    # Make sure that the filename does not contain any path separators.
    if "/" in filename or "\\" in filename:
        raise _exc_with_message(
            HTTPBadRequest, "Cannot upload a file with '/' or '\\' in the name."
        )

    # Make sure the filename ends with an allowed extension.
    if _dist_file_regexes[project.allow_legacy_files].search(filename) is None:
        raise _exc_with_message(
            HTTPBadRequest,
            "Invalid file extension: Use .egg, .tar.gz, .whl or .zip "
            "extension. (https://www.python.org/dev/peps/pep-0527)",
        )

    # Make sure that our filename matches the project that it is being uploaded
    # to.
    prefix = pkg_resources.safe_name(project.name).lower()
    if not pkg_resources.safe_name(filename).lower().startswith(prefix):
        raise _exc_with_message(
            HTTPBadRequest,
            "Start filename for {!r} with {!r}.".format(project.name, prefix),
        )

    # Check the content type of what is being uploaded
    if not request.POST["content"].type or request.POST["content"].type.startswith(
        "image/"
    ):
        raise _exc_with_message(HTTPBadRequest, "Invalid distribution file.")

    # Ensure that the package filetype is allowed.
    # TODO: Once PEP 527 is completely implemented we should be able to delete
    #       this and just move it into the form itself.
    if not project.allow_legacy_files and form.filetype.data not in {
        "sdist",
        "bdist_wheel",
        "bdist_egg",
    }:
        raise _exc_with_message(HTTPBadRequest, "Unknown type of file.")

    # The project may or may not have a file size specified on the project, if
    # it does then it may or may not be smaller or larger than our global file
    # size limits.
    file_size_limit = max(filter(None, [MAX_FILESIZE, project.upload_limit]))

    with tempfile.TemporaryDirectory() as tmpdir:
        temporary_filename = os.path.join(tmpdir, filename)

        # Buffer the entire file onto disk, checking the hash of the file as we
        # go along.
        with open(temporary_filename, "wb") as fp:
            file_size = 0
            file_hashes = {
                "md5": hashlib.md5(),
                "sha256": hashlib.sha256(),
                "blake2_256": hashlib.blake2b(digest_size=256 // 8),
            }
            for chunk in iter(lambda: request.POST["content"].file.read(8096), b""):
                file_size += len(chunk)
                if file_size > file_size_limit:
                    raise _exc_with_message(
                        HTTPBadRequest,
                        "File too large. "
                        + "Limit for project {name!r} is {limit} MB. ".format(
                            name=project.name, limit=file_size_limit // (1024 * 1024)
                        )
                        + "See "
                        + request.help_url(_anchor="file-size-limit"),
                    )
                fp.write(chunk)
                for hasher in file_hashes.values():
                    hasher.update(chunk)

        # Take our hash functions and compute the final hashes for them now.
        file_hashes = {k: h.hexdigest().lower() for k, h in file_hashes.items()}

        # Actually verify the digests that we've gotten. We're going to use
        # hmac.compare_digest even though we probably don't actually need to
        # because it's better safe than sorry. In the case of multiple digests
        # we expect them all to be given.
        if not all(
            [
                hmac.compare_digest(
                    getattr(form, "{}_digest".format(digest_name)).data.lower(),
                    digest_value,
                )
                for digest_name, digest_value in file_hashes.items()
                if getattr(form, "{}_digest".format(digest_name)).data
            ]
        ):
            raise _exc_with_message(
                HTTPBadRequest,
                "The digest supplied does not match a digest calculated "
                "from the uploaded file.",
            )

        # Check to see if the file that was uploaded exists already or not.
        is_duplicate = _is_duplicate_file(request.db, filename, file_hashes)
        if is_duplicate:
            return Response()
        elif is_duplicate is not None:
            raise _exc_with_message(
                HTTPBadRequest,
                # Note: Changing this error message to something that doesn't
                # start with "File already exists" will break the
                # --skip-existing functionality in twine
                # ref: https://github.com/pypa/warehouse/issues/3482
                # ref: https://github.com/pypa/twine/issues/332
                "File already exists. See "
                + request.help_url(_anchor="file-name-reuse"),
            )

        # Check to see if the file that was uploaded exists in our filename log
        if request.db.query(
            request.db.query(Filename).filter(Filename.filename == filename).exists()
        ).scalar():
            raise _exc_with_message(
                HTTPBadRequest,
                "This filename has already been used, use a "
                "different version. "
                "See " + request.help_url(_anchor="file-name-reuse"),
            )

        # Check to see if uploading this file would create a duplicate sdist
        # for the current release.
        if (
            form.filetype.data == "sdist"
            and request.db.query(
                request.db.query(File)
                .filter((File.release == release) & (File.packagetype == "sdist"))
                .exists()
            ).scalar()
        ):
            raise _exc_with_message(
                HTTPBadRequest, "Only one sdist may be uploaded per release."
            )

        # Check the file to make sure it is a valid distribution file.
        if not _is_valid_dist_file(temporary_filename, form.filetype.data):
            raise _exc_with_message(HTTPBadRequest, "Invalid distribution file.")

        # Check that if it's a binary wheel, it's on a supported platform
        if filename.endswith(".whl"):
            wheel_info = _wheel_file_re.match(filename)
            plats = wheel_info.group("plat").split(".")
            for plat in plats:
                if not _valid_platform_tag(plat):
                    raise _exc_with_message(
                        HTTPBadRequest,
                        "Binary wheel '{filename}' has an unsupported "
                        "platform tag '{plat}'.".format(filename=filename, plat=plat),
                    )

        # Also buffer the entire signature file to disk.
        if "gpg_signature" in request.POST:
            has_signature = True
            with open(os.path.join(tmpdir, filename + ".asc"), "wb") as fp:
                signature_size = 0
                for chunk in iter(
                    lambda: request.POST["gpg_signature"].file.read(8096), b""
                ):
                    signature_size += len(chunk)
                    if signature_size > MAX_SIGSIZE:
                        raise _exc_with_message(HTTPBadRequest, "Signature too large.")
                    fp.write(chunk)

            # Check whether signature is ASCII armored
            with open(os.path.join(tmpdir, filename + ".asc"), "rb") as fp:
                if not fp.read().startswith(b"-----BEGIN PGP SIGNATURE-----"):
                    raise _exc_with_message(
                        HTTPBadRequest, "PGP signature isn't ASCII armored."
                    )
        else:
            has_signature = False

        # TODO: This should be handled by some sort of database trigger or a
        #       SQLAlchemy hook or the like instead of doing it inline in this
        #       view.
        request.db.add(Filename(filename=filename))

        # Store the information about the file in the database.
        file_ = File(
            release=release,
            filename=filename,
            python_version=form.pyversion.data,
            packagetype=form.filetype.data,
            comment_text=form.comment.data,
            size=file_size,
            has_signature=bool(has_signature),
            md5_digest=file_hashes["md5"],
            sha256_digest=file_hashes["sha256"],
            blake2_256_digest=file_hashes["blake2_256"],
            # Figure out what our filepath is going to be, we're going to use a
            # directory structure based on the hash of the file contents. This
            # will ensure that the contents of the file cannot change without
            # it also changing the path that the file is saved to.
            path="/".join(
                [
                    file_hashes[PATH_HASHER][:2],
                    file_hashes[PATH_HASHER][2:4],
                    file_hashes[PATH_HASHER][4:],
                    filename,
                ]
            ),
            uploaded_via=request.user_agent,
        )
        request.db.add(file_)

        # TODO: This should be handled by some sort of database trigger or a
        #       SQLAlchemy hook or the like instead of doing it inline in this
        #       view.
        request.db.add(
            JournalEntry(
                name=release.project.name,
                version=release.version,
                action="add {python_version} file {filename}".format(
                    python_version=file_.python_version, filename=file_.filename
                ),
                submitted_by=request.user,
                submitted_from=request.remote_addr,
            )
        )

        # TODO: We need a better answer about how to make this transactional so
        #       this won't take effect until after a commit has happened, for
        #       now we'll just ignore it and save it before the transaction is
        #       committed.
        storage = request.find_service(IFileStorage)
        storage.store(
            file_.path,
            os.path.join(tmpdir, filename),
            meta={
                "project": file_.release.project.normalized_name,
                "version": file_.release.version,
                "package-type": file_.packagetype,
                "python-version": file_.python_version,
            },
        )
        if has_signature:
            storage.store(
                file_.pgp_path,
                os.path.join(tmpdir, filename + ".asc"),
                meta={
                    "project": file_.release.project.normalized_name,
                    "version": file_.release.version,
                    "package-type": file_.packagetype,
                    "python-version": file_.python_version,
                },
            )

    return Response()