def to_dbref(updater: DocumentUpdater):
    """Convert references (ObjectId, manual ref, dynamic ref) to dbref

    Values already stored as DBRef are left untouched.  ObjectId strings
    are first parsed into ObjectId, then wrapped into a DBRef pointing at
    the current collection.  Under the 'strict' migration policy any
    unrecognized value raises InconsistencyError.

    :param updater: DocumentUpdater object bound to the field to convert
    """
    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        # Missing or null field: nothing to convert
        if updater.field_name not in doc or doc[updater.field_name] is None:
            return

        f = doc[updater.field_name]
        if isinstance(f, bson.DBRef):
            # Already DBRef
            return
        elif isinstance(f, str):
            # ObjectId as string; on parse failure fall through with the
            # original string so the strict branch below can report it
            try:
                f = bson.ObjectId(f)
            except bson.errors.BSONError:
                pass

        # Collection may be absent for some update contexts
        collection_name = ctx.collection.name if ctx.collection is not None else None
        is_dict = isinstance(f, dict)
        if is_dict and isinstance(f.get('_id'), bson.ObjectId):
            # manual ref: {'_id': ObjectId(...)}
            doc[updater.field_name] = bson.DBRef(collection_name, f['_id'])
        elif is_dict and isinstance(f.get('_ref'), bson.DBRef):
            # dynamic ref: {'_ref': DBRef(...), ...} -- keep only the DBRef
            doc[updater.field_name] = f['_ref']
        elif isinstance(f, bson.ObjectId):
            doc[updater.field_name] = bson.DBRef(collection_name, f)
        elif updater.migration_policy.name == 'strict':
            # Other data type
            raise InconsistencyError(
                f"Field {updater.field_name} has wrong value {f!r} "
                f"(should be DBRef, ObjectId, manual ref, dynamic ref, "
                f"ObjectId string) in record {doc}")

    # TODO: precheck if field actually contains value other than DBRef
    updater.update_by_document(by_doc)
def __decrease_geojson_nesting(updater: DocumentUpdater,
                               from_type: str,
                               to_type: str,
                               depth: int = 1):
    """
    Extract the first element from nested arrays in `coordinates` field
    on GeoJSON fields with given type

    :param updater: DocumentUpdater object
    :param from_type: GeoJSON type to change
    :param to_type: this GeoJSON type will be set in changed records
    :param depth: nested array depth to extract from
    :return:
    """
    assert depth > 0

    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        if isinstance(doc.get(updater.field_name), dict):
            # Touch only objects with the expected GeoJSON type and a
            # non-empty 'coordinates' value
            match = doc[updater.field_name].get('type') == from_type \
                    and doc[updater.field_name].get('coordinates')
            if match:
                # Take the first element `depth` times; yields None as
                # soon as the value stops being a non-empty list/tuple
                doc[updater.field_name]['coordinates'] = functools.reduce(
                    lambda x, y: x[0] if x and isinstance(x, (list, tuple)) else None,
                    range(depth),
                    doc[updater.field_name].get('coordinates', [.0, .0]))

    # NOTE(review): `to_type` is accepted but the object's 'type' value is
    # never rewritten in this function -- presumably the caller updates it;
    # confirm against convert_geojson()
    updater.update_by_document(by_doc)
def to_string(updater: DocumentUpdater):
    """Cast field values (plain values and references) to strings"""
    def by_doc(ctx: ByDocContext):
        document = ctx.document
        name = updater.field_name
        if name not in document:
            return
        value = document[name]
        if value is None:
            return

        if isinstance(value, dict) and isinstance(value.get('_ref'), bson.DBRef):
            # dynamic ref
            document[name] = str(value['_ref'].id)
            return
        if isinstance(value, dict) and isinstance(value.get('_id'), bson.ObjectId):
            # manual ref
            document[name] = str(value['_id'])
            return
        if isinstance(value, bson.DBRef):
            document[name] = str(value.id)
            return

        # Plain value: fall back to str(); complain only in strict mode
        try:
            document[name] = str(value)
        except (TypeError, ValueError) as e:
            if updater.migration_policy.name == 'strict':
                raise MigrationError(
                    f'Cannot convert value {updater.field_name}: '
                    f'{document[updater.field_name]} to string') from e

    # TODO: precheck if field actually contains value other than string
    updater.update_by_document(by_doc)
def __increase_geojson_nesting(updater: DocumentUpdater,
                               from_type: str,
                               to_type: str,
                               depth: int = 1):
    """
    Wraps `coordinates` field into nested array on GeoJSON fields with
    given type.

    :param updater: DocumentUpdater object
    :param from_type: GeoJSON type to change
    :param to_type: this GeoJSON type will be set in changed records
    :param depth: nested array depth to wrap in
    :return:
    """
    assert depth > 0

    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        if isinstance(doc.get(updater.field_name), dict):
            # Touch only objects with the expected GeoJSON type and a
            # non-empty 'coordinates' value
            match = doc[updater.field_name].get('type') == from_type \
                    and doc[updater.field_name].get('coordinates')
            if match:
                # Wrap the coordinates into a one-element list `depth` times
                doc[updater.field_name]['coordinates'] = functools.reduce(
                    lambda x, y: [x],
                    range(depth),
                    doc[updater.field_name].get('coordinates', [.0, .0]))

    # NOTE(review): `to_type` is accepted but the object's 'type' value is
    # never rewritten in this function -- presumably the caller updates it;
    # confirm against convert_geojson()
    updater.update_by_document(by_doc)
def __check_value_types(updater: DocumentUpdater, allowed_types: List[str]):
    """
    Check if given field contains only given types of value.
    Raise InconsistencyError if other value types was found

    :param updater: DocumentUpdater object
    :param allowed_types: allowed MongoDB type aliases (as used by $type)
    :return:
    """
    # Map of MongoDB type aliases to Python types, see
    # https://docs.mongodb.com/manual/reference/operator/aggregation/convert/
    # FIX: 'object' and 'array' were missing although callers in this
    # module pass exactly those aliases, and the original assert used `<`
    # (proper subset), which would also reject a call listing every known
    # alias.  Hoisted out of by_doc so it is built once per call instead
    # of once per document.
    type_map = {'double': float, 'string': str, 'objectId': bson.ObjectId,
                'bool': bool, 'date': datetime, 'int': int,
                'long': int, 'decimal': float,
                'object': dict, 'array': (list, tuple)}
    assert set(allowed_types) <= type_map.keys()
    # isinstance() accepts nested tuples, so the 'array' entry is fine here
    valid_types = tuple(type_map[t] for t in allowed_types)

    def by_path(ctx: ByPathContext):
        # Check for data types other than objects or arrays
        fltr = {"$and": [
            {ctx.filter_dotpath: {"$ne": None}},
            *[{k: v} for k, v in ctx.extra_filter.items()],
            # $expr >= 3.6, $type >= 3.4
            {"$expr": {"$not": [{"$in": [{"$type": f'${ctx.filter_dotpath}'}, allowed_types]}]}}
        ]}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    def by_doc(ctx: ByDocContext):
        # Client-side counterpart of by_path
        doc = ctx.document
        if updater.field_name in doc:
            f = doc[updater.field_name]
            valid = f is None or isinstance(f, valid_types)
            if not valid:
                raise InconsistencyError(
                    f"Field {updater.field_name} has wrong type of value "
                    f"{f!r} (should be any of {valid_types}) in record {doc}")

    updater.update_combined(by_path, by_doc, False, False)
def item_to_list(updater: DocumentUpdater, remove_cls_key=False):
    """Make a list with single element from every non-array value"""
    def by_doc(ctx: ByDocContext):
        name = updater.field_name
        if name not in ctx.document:
            return

        value = ctx.document[name]
        if value is None:
            # null -> []
            ctx.document[name] = []
            return

        if remove_cls_key and isinstance(value, dict):
            value.pop('_cls', None)
        if not isinstance(value, (list, tuple)):
            ctx.document[name] = [value]

    updater.update_by_document(by_doc)
def __check_geojson_objects(updater: DocumentUpdater, geojson_types: List[str]):
    """
    Check if all object values in field are GeoJSON objects of given
    types. Raise InconsistencyError if other objects found

    :param updater: DocumentUpdater object
    :param geojson_types: allowed values of the GeoJSON 'type' key
    :return:
    """
    def by_path(ctx: ByPathContext):
        # Query-side check: any non-null *object* whose 'type' key is not
        # one of the allowed GeoJSON types makes the result non-empty
        fltr = {"$and": [
            {ctx.filter_dotpath: {"$ne": None}},
            *[{k: v} for k, v in ctx.extra_filter.items()],
            {f'{ctx.filter_dotpath}.type': {'$nin': geojson_types}},
            # $expr >= 3.6
            {"$expr": {"$eq": [{"$type": f'${ctx.filter_dotpath}'}, 'object']}}
        ]}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    def by_doc(ctx: ByDocContext):
        # Client-side counterpart.
        # NOTE(review): unlike by_path (which only inspects objects), this
        # also rejects non-dict values -- confirm the asymmetry is intended
        doc = ctx.document
        if updater.field_name in doc:
            f = doc[updater.field_name]
            valid = f is None or (isinstance(f, dict) and f.get('type') in geojson_types)
            if not valid:
                raise InconsistencyError(
                    f"Field {updater.field_name} has wrong value {f!r} "
                    f"(should be GeoJSON) in record {doc}")

    updater.update_combined(by_path, by_doc, False, False)
def __check_legacy_point_coordinates(updater: DocumentUpdater):
    """
    Check if all array values in field has legacy geo point coordinates
    type. Raise InconsistencyError if other arrays was found

    :param updater: DocumentUpdater object
    :return:
    """
    def by_path(ctx: ByPathContext):
        # Query-side check: flag any non-null array whose size is not 2
        fltr = {"$and": [
            {ctx.filter_dotpath: {"$ne": None}},
            *[{k: v} for k, v in ctx.extra_filter.items()],
            # $expr >= 3.6, $isArray >= 3.2
            {"$expr": {"$eq": [{"$isArray": f"${ctx.filter_dotpath}"}, True]}},
            {"$expr": {"$ne": [{"$size": f"${ctx.filter_dotpath}"}, 2]}},  # $expr >= 3.6
            # TODO: add element type check
        ]}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    def by_doc(ctx: ByDocContext):
        # Client-side counterpart: value must be None or a 2-element
        # list/tuple.
        # NOTE(review): unlike by_path (which only inspects arrays), this
        # also rejects non-sequence values -- confirm intended
        doc = ctx.document
        if updater.field_name in doc:
            f = doc[updater.field_name]
            valid = f is None or (isinstance(f, (list, tuple)) and len(f) == 2)
            if not valid:
                raise InconsistencyError(
                    f"Field {updater.field_name} has wrong value {f!r} "
                    f"(should be legacy geo point) in record {doc}")

    updater.update_combined(by_path, by_doc, False, False)
def remove_cls_key(updater: DocumentUpdater):
    """Unset '_cls' key in documents if any"""
    def by_path(ctx: ByPathContext):
        # Match only documents where the nested '_cls' key actually exists
        query = {ctx.filter_dotpath + '._cls': {'$exists': True}}
        query.update(ctx.extra_filter)
        ctx.collection.update_many(
            query,
            {'$unset': {ctx.update_dotpath + '._cls': ''}},
            array_filters=ctx.build_array_filters())

    updater.update_by_path(by_path)
def drop_field(updater: DocumentUpdater):
    """Drop field"""
    def by_path(ctx: ByPathContext):
        # Unset the field in every document where it currently exists
        query = {ctx.filter_dotpath: {'$exists': True}}
        query.update(ctx.extra_filter)
        ctx.collection.update_many(
            query,
            {'$unset': {ctx.update_dotpath: ''}},
            array_filters=ctx.build_array_filters())

    updater.update_by_path(by_path)
def test_deny__should_raise_error(test_db, load_fixture):
    """deny() should raise MigrationError for this field"""
    loaded_schema = load_fixture('schema1').get_schema()
    strict_updater = DocumentUpdater(test_db, 'Schema1Doc1', loaded_schema,
                                     'doc1_str', MigrationPolicy.strict)

    with pytest.raises(MigrationError):
        converters.deny(strict_updater)
def _run_migration(self,
                   self_schema: Schema.Document,
                   parameters: Mapping[str, Any],
                   swap: bool = False):
    """Compare document parameters between schema and target values and
    dispatch each difference to the matching ``change_<name>`` handler.

    :param self_schema: schema document holding the current parameters
    :param parameters: target parameter values
    :param swap: if True then old/new sides of each Diff are swapped
        (used for running a migration in the reverse direction)
    :raises SchemaError: if a differing parameter has no handler method
    """
    # Try to process all parameters on same order to avoid
    # potential problems on repeated launches if some query on
    # previous launch was failed
    for name in sorted(parameters.keys() | self_schema.parameters.keys()):
        # UNSET marks a parameter missing on one of the two sides
        left_value = self_schema.parameters.get(name, UNSET)
        right_value = parameters.get(name, UNSET)
        if left_value == right_value:
            continue

        diff = Diff(old=right_value if swap else left_value,
                    new=left_value if swap else right_value,
                    key=name)
        log.debug(">> Change %s: %s => %s", repr(name), repr(diff.old), repr(diff.new))
        # Dispatch to the change_<parameter> handler defined on this class
        try:
            method = getattr(self, f'change_{name}')
        except AttributeError as e:
            raise SchemaError(f'Unknown document parameter: {name}') from e

        # Pass the document class name to the updater only for documents
        # with inheritance enabled
        inherit = self._run_ctx['left_schema'][
            self.document_type].parameters.get('inherit')
        document_cls = document_type_to_class_name(
            self.document_type) if inherit else None
        updater = DocumentUpdater(self._run_ctx['db'], self.document_type,
                                  self._run_ctx['left_schema'], '',
                                  self._run_ctx['migration_policy'],
                                  document_cls)
        method(updater, diff)
def change_inherit(self, updater: DocumentUpdater, diff: Diff):
    """Remove '_cls' key if EmbeddedDocument becomes non-inherit,
    otherwise do nothing
    """
    def by_path(ctx: ByPathContext):
        query = {ctx.filter_dotpath + '._cls': {'$exists': True}}
        query.update(ctx.extra_filter)
        unset_spec = {'$unset': {ctx.update_dotpath + '._cls': ''}}
        ctx.collection.update_many(
            query, unset_spec, array_filters=ctx.build_array_filters())

    self._check_diff(diff, False, bool)
    if diff.new:
        # Inheritance stays enabled -- keep '_cls' keys
        return

    updater.update_by_path(by_path)
def test_to_decimal__if_value_does_not_contain_number__should_raise_error(
        test_db, load_fixture, document_type, field_name):
    """Non-numeric values must make to_decimal() fail under strict policy"""
    fixture_schema = load_fixture('schema1').get_schema()
    updater = DocumentUpdater(test_db, document_type, fixture_schema,
                              field_name, MigrationPolicy.strict)

    with pytest.raises(MigrationError):
        converters.to_decimal(updater)
def legacy_pairs_to_geojson(updater: DocumentUpdater, to_type: str):
    """Convert legacy coordinate pairs to GeoJSON objects of given type

    The conversion is done in two steps: every [x, y] pair is first
    wrapped into a GeoJSON Point, then convert_geojson() turns Point
    objects into `to_type` objects.

    :param updater: DocumentUpdater object
    :param to_type: target GeoJSON type
    """
    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        # Wrap a legacy [x, y] pair into an intermediate GeoJSON Point
        if isinstance(doc.get(updater.field_name), (list, tuple)):
            doc[updater.field_name] = {
                'type': 'Point',
                'coordinates': doc[updater.field_name]
            }

    if updater.migration_policy.name == 'strict':
        # Prechecks run before any modification so inconsistent data
        # fails fast
        __check_geojson_objects(updater, ['Point', to_type])
        __check_legacy_point_coordinates(updater)
        __check_value_types(updater, ['object', 'array'])

    updater.update_by_document(by_doc)
    # Second step: Point -> to_type
    convert_geojson(updater, 'Point', to_type)
def test_extract_from_list__if_value_is_not_list__should_raise_error(
        test_db, load_fixture, document_type, field_name, dump_db):
    """extract_from_list() must fail on non-list values under strict policy"""
    loaded_schema = load_fixture('schema1').get_schema()
    updater = DocumentUpdater(test_db, document_type, loaded_schema,
                              field_name, MigrationPolicy.strict)

    with pytest.raises(MigrationError):
        converters.extract_from_list(updater, int)
def geojson_to_legacy_pairs(updater: DocumentUpdater, from_type: str):
    """Convert GeoJSON objects of given type to legacy coordinate pairs

    The conversion is done in two steps: convert_geojson() first turns
    `from_type` objects into Points, then every Point is stripped down
    to its raw `coordinates` pair.

    :param updater: DocumentUpdater object
    :param from_type: GeoJSON type to convert from
    """
    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        if isinstance(doc.get(updater.field_name), dict):
            # FIX: the original tested `'Point' in doc[...]`, i.e. whether
            # the dict has a *key* named 'Point'.  GeoJSON objects look
            # like {'type': 'Point', 'coordinates': [...]}, so that test
            # was always False and nothing was ever converted.  Compare
            # the 'type' value instead.
            if doc[updater.field_name].get('type') == 'Point':
                doc[updater.field_name] = doc[updater.field_name].get(
                    'coordinates')

    if updater.migration_policy.name == 'strict':
        # Prechecks run before any modification so inconsistent data
        # fails fast
        __check_geojson_objects(updater, ["Point", from_type])
        __check_legacy_point_coordinates(updater)
        __check_value_types(updater, ['object', 'array'])

    # First step: from_type -> Point
    convert_geojson(updater, from_type, 'Point')

    # Second step: Point -> raw coordinates pair
    updater.update_by_document(by_doc)
def to_url_string(updater: DocumentUpdater, check_only=False):
    """Cast fields to string and then verify if they contain URLs

    :param updater: DocumentUpdater object
    :param check_only: if True then only verify the values without
        casting them to string first
    """
    def by_path(ctx: ByPathContext):
        # Find non-null values which do not match the URL pattern
        fltr = {ctx.filter_dotpath: {'$not': url_regex, '$ne': None},
                **ctx.extra_filter}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    # FIX: the character class after '://' had no quantifier, so the
    # pattern matched only URLs whose part after the scheme was exactly
    # one character (it rejected e.g. 'http://example.com').  Added '+'.
    url_regex = re.compile(
        r"\A[A-Z]{3,}://[A-Z0-9\-._~:/?#\[\]@!$&'()*+,;%=]+\Z",
        re.IGNORECASE
    )
    # FIX: `check_only` was accepted but ignored -- honor it so callers
    # can verify without modifying data (default behavior is unchanged)
    if not check_only:
        to_string(updater)
    if updater.migration_policy.name == 'strict':
        updater.update_by_path(by_path)
def to_email_string(updater: DocumentUpdater):
    """Cast fields to string and then verify if they contain email
    addresses (strict policy only).
    """
    def by_path(ctx: ByPathContext):
        # Find non-null values which do not match the email pattern
        fltr = {ctx.filter_dotpath: {'$not': email_regex, '$ne': None},
                **ctx.extra_filter}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    to_string(updater)

    # NOTE(review): this placeholder pattern matches any single-line
    # string, so the strict check below cannot currently fail on content
    email_regex = re.compile(
        r"\A.*\Z",  # TODO: insert email validation regex here
        re.IGNORECASE)
    if updater.migration_policy.name == 'strict':
        updater.update_by_path(by_path)
def change_dynamic(self, updater: DocumentUpdater, diff: Diff):
    """If document becomes non-dynamic then remove fields which are not
    defined in mongoengine EmbeddedDocument
    """
    def by_doc(ctx: ByDocContext):
        known_keys = self_schema.keys()
        if ctx.document.keys() - known_keys:
            # Rebuild the document in place keeping only schema fields
            kept = {key: value for key, value in ctx.document.items()
                    if key in known_keys}
            ctx.document.clear()
            ctx.document.update(kept)

    self._check_diff(diff, False, bool)
    if diff.new:
        return  # Nothing to do

    # Remove fields which are not in schema
    self_schema = self._run_ctx['left_schema'][self.document_type]  # type: Schema.Document
    updater.update_by_document(by_doc)
def to_complex_datetime(updater: DocumentUpdater):
    """Cast fields to string and verify they match the ComplexDateTime
    string format (strict policy only).
    """
    def by_path(ctx: ByPathContext):
        # Find non-null values which do not match the expected format
        fltr = {ctx.filter_dotpath: {'$not': regex, '$ne': None},
                **ctx.extra_filter}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    to_string(updater)

    # We should not know which separator is used, so use '.+'
    # Separator change is handled by appropriate field method
    # FIX: MongoDB's `$not` does not accept a plain pattern string -- it
    # needs a regex object (the sibling converters here pass compiled
    # patterns too), so compile it.  Also dropped a redundant str() call.
    regex = re.compile(
        r'\A' + '.+'.join([r"\d{4}"] + [r"\d{2}"] * 5 + [r"\d{6}"]) + r'\Z')
    if updater.migration_policy.name == 'strict':
        updater.update_by_path(by_path)
def to_email_string(updater: DocumentUpdater):
    """Cast fields to string and then verify if they contain email
    addresses (strict policy only).
    """
    def by_path(ctx: ByPathContext):
        # Evaluated server-side via $regex: '\p{L}' is PCRE
        # unicode-property syntax understood by MongoDB, not by Python re
        email_regex = r"\A[^\W][A-Z0-9._%+-]+@[\p{L}0-9.-]+\.\p{L}+\Z"
        # Find non-null values which do not match the email pattern.
        # NOTE(review): '$not' combined with the '$regex' operator
        # expression requires MongoDB >= 4.0.7 -- confirm the minimum
        # supported server version
        fltr = {ctx.filter_dotpath: {'$not': {'$regex': email_regex, '$options': 'i'},
                                     '$ne': None},
                **ctx.extra_filter}
        check_empty_result(ctx.collection, ctx.filter_dotpath, fltr)

    to_string(updater)

    if updater.migration_policy.name == 'strict':
        updater.update_by_path(by_path)
def __mongo_convert(updater: DocumentUpdater, target_type: str):
    """
    Convert field to a given type in a given collection. `target_type`
    contains MongoDB type name, such as 'string', 'decimal', etc.

    https://docs.mongodb.com/manual/reference/operator/aggregation/convert/
    :param updater: DocumentUpdater object
    :param target_type: MongoDB type name
    :return:
    """
    # Map of MongoDB type names to Python converter callables, see
    # https://docs.mongodb.com/manual/reference/operator/aggregation/convert/
    # Hoisted out of by_doc so it is built once per call, not per document
    type_map = {'double': float, 'string': str, 'objectId': bson.ObjectId,
                'bool': bool, 'date': lambda x: dateutil_parse(str(x)),
                'int': int, 'long': bson.Int64, 'decimal': float,
                'binary': bson.Binary, 'object': dict}
    assert target_type in type_map
    converter = type_map[target_type]
    # FIX: for 'date' the map holds a parsing lambda; isinstance() raises
    # TypeError when given a non-type as its second argument, so the
    # "already converted" check must use datetime for that case
    check_type = datetime if target_type == 'date' else converter

    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        field_name = updater.field_name
        if field_name in doc:
            if not isinstance(doc[field_name], check_type) \
                    and doc[field_name] is not None:
                try:
                    doc[field_name] = converter(doc[field_name])
                except (TypeError, ValueError) as e:
                    if updater.migration_policy.name == 'strict':
                        raise MigrationError(
                            f'Cannot convert value '
                            f'{field_name}: {doc[field_name]} to type {converter}'
                        ) from e

    updater.update_by_document(by_doc)
def test_item_to_list__if_value_is_list__should_wrap_value_in_a_list_with_single_element(
        test_db, load_fixture, document_type, field_name, dump_db):
    """Values which are already lists must be left untouched"""
    fixture_schema = load_fixture('schema1').get_schema()
    updater = DocumentUpdater(test_db, document_type, fixture_schema,
                              field_name, MigrationPolicy.strict)
    snapshot_before = dump_db()

    converters.item_to_list(updater)

    assert dump_db() == snapshot_before
def to_uuid_bin(updater: DocumentUpdater):
    """Convert strings with UUID to binData with UUID"""
    uuid_pattern = re.compile(
        r'\A[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12}\Z',
        re.IGNORECASE)

    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        name = updater.field_name
        if name not in doc:
            return
        value = doc[name]
        # Missing/null values and already-converted UUIDs stay as-is
        if value is None or isinstance(value, uuid.UUID):
            return

        if isinstance(value, str) and uuid_pattern.match(value):
            doc[name] = uuid.UUID(value)
        elif updater.migration_policy.name == 'strict':
            raise InconsistencyError(
                f"Field {updater.field_name} has wrong value {value!r} "
                f"(should be UUID string or UUID Binary data) in record {doc}")

    updater.update_by_document(by_doc)
def extract_from_list(updater: DocumentUpdater, item_type, remove_cls_key=False):
    """
    Replace every list which was met with its first element with
    checking item type. If type is other than `item_type` then the
    error will be raised

    :param updater: DocumentUpdater object
    :param item_type: python type(s) to check the element
    :param remove_cls_key: if True then '_cls' keys will be removed
     from dict items if any
    :return:
    """
    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        # Missing or null field: nothing to extract
        if updater.field_name not in doc or doc[updater.field_name] is None:
            return

        f = doc[updater.field_name]
        if isinstance(f, (list, tuple)):
            if f:
                f = f[0]
                if remove_cls_key and isinstance(f, dict) and '_cls' in f:
                    del f['_cls']
                # Type check of the extracted element (strict policy only)
                if not isinstance(
                        f, item_type
                ) and updater.migration_policy.name == 'strict':
                    raise InconsistencyError(
                        f"Field {updater.field_name} has wrong value {f!r} "
                        f"(should be {item_type}) in record {doc}")
            else:
                # Empty list becomes None
                f = None
            doc[updater.field_name] = f
        elif f is not None and updater.migration_policy.name == 'strict':
            # NOTE(review): `f is not None` is always True here because of
            # the early return above -- redundant but harmless
            raise MigrationError(
                f'Could not extract item from non-list value '
                f'{updater.field_name}: {doc[updater.field_name]}')

    updater.update_by_document(by_doc)
def to_dynamic_ref(updater: DocumentUpdater):
    """Convert references (ObjectId, DBRef, manual ref) to dynamic ref

    :param updater: DocumentUpdater object bound to the field to convert
    """
    def by_doc(ctx: ByDocContext):
        doc = ctx.document
        # Missing or null field: nothing to convert
        if updater.field_name not in doc or doc[updater.field_name] is None:
            return

        f = doc[updater.field_name]
        # is_dict is captured before the string branch below; that branch
        # can only rebind `f` from str to ObjectId, never to dict, so the
        # early capture is equivalent
        is_dict = isinstance(f, dict)
        collection_name = ctx.collection.name if ctx.collection is not None else None
        if isinstance(f, str):
            # ObjectId as string; on parse failure fall through with the
            # original string so the strict branch below can report it
            try:
                f = bson.ObjectId(f)
            except bson.errors.BSONError:
                pass

        # We cannot get dynamic ref from other types of refs because
        # of lack of '_cls' value. Mongoengine fields which use this
        # converter can keep DBRef. So return DBRef instead
        if is_dict and isinstance(f.get('_ref'), bson.DBRef):
            # Already dynamic ref
            return
        elif isinstance(f, bson.DBRef):
            return
        elif is_dict and isinstance(f.get('_id'), bson.ObjectId):
            # manual ref
            doc[updater.field_name] = bson.DBRef(collection_name, f['_id'])
        elif isinstance(f, bson.ObjectId):
            doc[updater.field_name] = bson.DBRef(collection_name, f)
        elif updater.migration_policy.name == 'strict':
            # Other data type
            raise InconsistencyError(
                f"Field {updater.field_name} has wrong value {f!r} "
                f"(should be DBRef, ObjectId, manual ref, dynamic ref) "
                f"in record {doc}")

    # TODO: precheck if field actually contains value other than dynamic ref or DBRef
    updater.update_by_document(by_doc)
def test_drop_field__should_drop_field(test_db, load_fixture, document_type,
                                       field_name, dump_db):
    """drop_field() removes the field everywhere it is embedded"""
    loaded_schema = load_fixture('schema1').get_schema()
    updater = DocumentUpdater(test_db, document_type, loaded_schema,
                              field_name, MigrationPolicy.strict)
    expect = dump_db()
    jsonpath_parsers = load_fixture('schema1').get_embedded_jsonpath_parsers(
        document_type)
    # Build the expected database state: the field gone from every match
    for parser in jsonpath_parsers:
        for match in parser.find(expect):
            del match.value[field_name]

    converters.drop_field(updater)

    assert dump_db() == expect
def test_item_to_list__if_value_is_not_a_list__should_wrap_value_in_a_list_with_single_element(
        test_db, load_fixture, document_type, field_name, dump_db):
    """Scalar values get wrapped into single-element lists"""
    loaded_schema = load_fixture('schema1').get_schema()
    updater = DocumentUpdater(test_db, document_type, loaded_schema,
                              field_name, MigrationPolicy.strict)
    expect = dump_db()
    jsonpath_parsers = load_fixture('schema1').get_embedded_jsonpath_parsers(
        document_type)
    # Build the expected database state: each value wrapped into a list
    for parser in jsonpath_parsers:
        for match in parser.find(expect):
            match.value[field_name] = [match.value[field_name]]

    converters.item_to_list(updater)

    assert dump_db() == expect
def test_extract_from_list__should_extract_the_first_value_from_list(
        test_db, load_fixture, document_type, field_name, dump_db):
    """Lists collapse to their first element; empty lists become None"""
    loaded_schema = load_fixture('schema1').get_schema()
    updater = DocumentUpdater(test_db, document_type, loaded_schema,
                              field_name, MigrationPolicy.strict)
    expect = dump_db()
    jsonpath_parsers = load_fixture('schema1').get_embedded_jsonpath_parsers(
        document_type)
    # Build the expected database state
    for parser in jsonpath_parsers:
        for match in parser.find(expect):
            current = match.value[field_name]
            match.value[field_name] = current[0] if len(current) else None

    converters.extract_from_list(updater, int)

    assert dump_db() == expect