def get_workunits(self) -> Iterable[MetadataWorkUnit]: platform = "mongodb" database_names: List[str] = self.mongo_client.list_database_names() # traverse databases in sorted order so output is consistent for database_name in sorted(database_names): if database_name in DENY_DATABASE_LIST: continue if not self.config.database_pattern.allowed(database_name): self.report.report_dropped(database_name) continue database = self.mongo_client[database_name] collection_names: List[str] = database.list_collection_names() # traverse collections in sorted order so output is consistent for collection_name in sorted(collection_names): dataset_name = f"{database_name}.{collection_name}" if not self.config.collection_pattern.allowed(dataset_name): self.report.report_dropped(dataset_name) continue dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})" dataset_snapshot = DatasetSnapshot( urn=dataset_urn, aspects=[], ) dataset_properties = DatasetPropertiesClass( tags=[], customProperties={}, ) dataset_snapshot.aspects.append(dataset_properties) if self.config.enableSchemaInference: assert self.config.maxDocumentSize is not None collection_schema = construct_schema_pymongo( database[collection_name], delimiter=".", use_random_sampling=self.config.useRandomSampling, max_document_size=self.config.maxDocumentSize, is_version_gte_4_4=self.is_server_version_gte_4_4(), sample_size=self.config.schemaSamplingSize, ) # initialize the schema for the collection canonical_schema: List[SchemaField] = [] max_schema_size = self.config.maxSchemaSize collection_schema_size = len(collection_schema.values()) collection_fields: Union[ List[SchemaDescription], ValuesView[ SchemaDescription]] = collection_schema.values() assert max_schema_size is not None if collection_schema_size > max_schema_size: # downsample the schema, using frequency as the sort key self.report.report_warning( key=dataset_urn, reason= f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}", ) collection_fields = sorted( collection_schema.values(), key=lambda x: x["count"], reverse=True, )[0:max_schema_size] # Add this information to the custom properties so user can know they are looking at downsampled schema dataset_properties.customProperties[ "schema.downsampled"] = "True" dataset_properties.customProperties[ "schema.totalFields"] = f"{collection_schema_size}" logger.debug( f"Size of collection fields = {len(collection_fields)}" ) # append each schema field (sort so output is consistent) for schema_field in sorted( collection_fields, key=lambda x: x["delimited_name"]): field = SchemaField( fieldPath=schema_field["delimited_name"], nativeDataType=self.get_pymongo_type_string( schema_field["type"], dataset_name), type=self.get_field_type(schema_field["type"], dataset_name), description=None, nullable=schema_field["nullable"], recursive=False, ) canonical_schema.append(field) # create schema metadata object for collection schema_metadata = SchemaMetadata( schemaName=collection_name, platform=f"urn:li:dataPlatform:{platform}", version=0, hash="", platformSchema=SchemalessClass(), fields=canonical_schema, ) dataset_snapshot.aspects.append(schema_metadata) # TODO: use list_indexes() or index_information() to get index information # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes. mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot) wu = MetadataWorkUnit(id=dataset_name, mce=mce) self.report.report_workunit(wu) yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]: platform = "mongodb" database_names: List[str] = self.mongo_client.list_database_names() # traverse databases in sorted order so output is consistent for database_name in sorted(database_names): if database_name in DENY_DATABASE_LIST: continue if not self.config.database_pattern.allowed(database_name): self.report.report_dropped(database_name) continue database = self.mongo_client[database_name] collection_names: List[str] = database.list_collection_names() # traverse collections in sorted order so output is consistent for collection_name in sorted(collection_names): dataset_name = f"{database_name}.{collection_name}" if not self.config.collection_pattern.allowed(dataset_name): self.report.report_dropped(dataset_name) continue dataset_snapshot = DatasetSnapshot( urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})", aspects=[], ) dataset_properties = DatasetPropertiesClass( tags=[], customProperties={}, ) dataset_snapshot.aspects.append(dataset_properties) if self.config.enableSchemaInference: collection_schema = construct_schema_pymongo( database[collection_name], delimiter=".", sample_size=self.config.schemaSamplingSize, ) # initialize the schema for the collection canonical_schema: List[SchemaField] = [] # append each schema field (sort so output is consistent) for schema_field in sorted( collection_schema.values(), key=lambda x: x["delimited_name"] ): field = SchemaField( fieldPath=schema_field["delimited_name"], nativeDataType=self.get_pymongo_type_string( schema_field["type"], dataset_name ), type=self.get_field_type( schema_field["type"], dataset_name ), description=None, nullable=schema_field["nullable"], recursive=False, ) canonical_schema.append(field) # create schema metadata object for collection actor = "urn:li:corpuser:etl" sys_time = int(time.time() * 1000) schema_metadata = SchemaMetadata( schemaName=collection_name, platform=f"urn:li:dataPlatform:{platform}", version=0, hash="", platformSchema=SchemalessClass(), created=AuditStamp(time=sys_time, actor=actor), lastModified=AuditStamp(time=sys_time, actor=actor), fields=canonical_schema, ) dataset_snapshot.aspects.append(schema_metadata) # TODO: use list_indexes() or index_information() to get index information # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes. mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot) wu = MetadataWorkUnit(id=dataset_name, mce=mce) self.report.report_workunit(wu) yield wu