Example #1
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})"

                dataset_snapshot = DatasetSnapshot(
                    urn=dataset_urn,
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:
                    assert self.config.maxDocumentSize is not None
                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        use_random_sampling=self.config.useRandomSampling,
                        max_document_size=self.config.maxDocumentSize,
                        is_version_gte_4_4=self.is_server_version_gte_4_4(),
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []
                    max_schema_size = self.config.maxSchemaSize
                    collection_schema_size = len(collection_schema.values())
                    collection_fields: Union[
                        List[SchemaDescription], ValuesView[SchemaDescription]
                    ] = collection_schema.values()
                    assert max_schema_size is not None
                    if collection_schema_size > max_schema_size:
                        # downsample the schema, using frequency as the sort key
                        self.report.report_warning(
                            key=dataset_urn,
                            reason=f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}",
                        )
                        collection_fields = sorted(
                            collection_schema.values(),
                            key=lambda x: x["count"],
                            reverse=True,
                        )[0:max_schema_size]
                        # Record the downsampling in the custom properties so users
                        # know they are looking at a downsampled schema.
                        dataset_properties.customProperties["schema.downsampled"] = "True"
                        dataset_properties.customProperties["schema.totalFields"] = f"{collection_schema_size}"

                    logger.debug(
                        f"Size of collection fields = {len(collection_fields)}"
                    )
                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                        collection_fields, key=lambda x: x["delimited_name"]
                    ):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name
                            ),
                            type=self.get_field_type(schema_field["type"], dataset_name),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
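
The downsampling branch above is worth calling out: when the inferred schema has more fields than maxSchemaSize, only the most frequently observed fields are kept, the dataset is flagged through custom properties, and the survivors are re-sorted by delimited name so the emitted order stays deterministic. A minimal standalone sketch of that selection logic, using plain dicts in place of DataHub's SchemaDescription entries (the field names, counts, and threshold are all illustrative):

from typing import Dict, List

# Hypothetical inferred schema: flattened field path -> stats, mirroring the
# "delimited_name"/"count" keys used in the example above.
inferred_schema: Dict[str, dict] = {
    "name": {"delimited_name": "name", "count": 1000},
    "city": {"delimited_name": "city", "count": 980},
    "address.zip": {"delimited_name": "address.zip", "count": 310},
}

MAX_SCHEMA_SIZE = 2  # illustrative threshold

fields: List[dict] = list(inferred_schema.values())
if len(fields) > MAX_SCHEMA_SIZE:
    # Keep the MAX_SCHEMA_SIZE most common fields, using frequency as the sort key.
    fields = sorted(fields, key=lambda x: x["count"], reverse=True)[:MAX_SCHEMA_SIZE]

# Emit in deterministic order, as the example does before building SchemaFields.
for f in sorted(fields, key=lambda x: x["delimited_name"]):
    print(f["delimited_name"], f["count"])  # -> city 980, then name 1000

Frequency decides which fields survive the cut; the second sort by delimited_name only fixes the output order, so repeated ingestion runs produce the same schema aspect.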
Example #2
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:

                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []

                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                        collection_schema.values(), key=lambda x: x["delimited_name"]
                    ):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name
                            ),
                            type=self.get_field_type(
                                schema_field["type"], dataset_name
                            ),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    actor = "urn:li:corpuser:etl"
                    sys_time = int(time.time() * 1000)
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        created=AuditStamp(time=sys_time, actor=actor),
                        lastModified=AuditStamp(time=sys_time, actor=actor),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
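
Example #2 appears to be an earlier revision of the same source: construct_schema_pymongo takes no sampling or document-size options here, and the SchemaMetadata carries explicit created/lastModified AuditStamps. In both examples the heavy lifting is the schema inference itself: sampled documents are flattened into dot-delimited field paths with occurrence counts and nullability. Below is a simplified, self-contained sketch of that idea, assuming nothing about DataHub's actual construct_schema_pymongo beyond the keys the examples read ("delimited_name", "count", "type", "nullable"); the helper name infer_schema and the sample documents are invented for illustration:

from typing import Any, Dict, List

def infer_schema(docs: List[dict], delimiter: str = ".") -> Dict[str, dict]:
    """Toy dot-delimited schema inference over sampled documents.

    An illustrative sketch, not DataHub's construct_schema_pymongo: per
    flattened field path it records an occurrence count, the last-seen
    Python type, and whether the field was absent from any sampled doc.
    """
    schema: Dict[str, dict] = {}

    def visit(doc: Dict[str, Any], prefix: str = "") -> None:
        for key, value in doc.items():
            path = f"{prefix}{delimiter}{key}" if prefix else key
            entry = schema.setdefault(
                path, {"delimited_name": path, "count": 0, "type": type(value)}
            )
            entry["count"] += 1
            entry["type"] = type(value)
            if isinstance(value, dict):
                visit(value, path)  # recurse into embedded documents

    for doc in docs:
        visit(doc)

    for entry in schema.values():
        # Treat a field as nullable if some sampled document lacked it.
        entry["nullable"] = entry["count"] < len(docs)
    return schema

# Nested keys flatten to "address.zip"; "nickname" is missing from one
# document, so it comes out nullable.
sample = [
    {"name": "ada", "address": {"zip": "02139"}},
    {"name": "bob", "address": {"zip": "10001"}, "nickname": "b"},
]
for path, entry in sorted(infer_schema(sample).items()):
    print(path, entry["count"], entry["nullable"])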