from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp

from datahub.ingestion.extractor import schema_util
from datahub.ingestion.source.sql.sql_common import (
    BasicSQLAlchemyConfig,
    SQLAlchemySource,
    register_custom_type,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    SchemaField,
    TimeTypeClass,
)

register_custom_type(HiveDate, DateTypeClass)
register_custom_type(HiveTimestamp, TimeTypeClass)
register_custom_type(HiveDecimal, NumberTypeClass)


class HiveConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = "hive"

    # The Hive SQLAlchemy connector returns views as tables.
    # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273.
    # Disabling views prevents this duplication.
    include_views = False


class HiveSource(SQLAlchemySource):
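
# Usage sketch (hypothetical values, not from the source): HiveConfig is a
# pydantic model, so it can be parsed from a recipe-style dict via parse_obj(),
# after which the defaults declared above apply.
hive_config = HiveConfig.parse_obj({"host_port": "localhost:10000"})
assert hive_config.scheme == "hive"
assert hive_config.include_views is False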
        query = textwrap.dedent(query) + audit_log_filter
    return textwrap.dedent(query)


def get_view_definition(self, connection, view_name, schema=None, **kw):
    view = self._get_table(connection, view_name, schema)
    return view.view_query


pybigquery.sqlalchemy_bigquery.BigQueryDialect.get_view_definition = get_view_definition

# Handle the GEOGRAPHY type. We will temporarily patch the _type_map
# in the get_workunits method of the source.
GEOGRAPHY = make_sqlalchemy_type("GEOGRAPHY")
register_custom_type(GEOGRAPHY)
assert pybigquery.sqlalchemy_bigquery._type_map


class BigQueryCredential(ConfigModel):
    project_id: str
    private_key_id: str
    private_key: str
    client_email: str
    client_id: str
    auth_uri: str = "https://accounts.google.com/o/oauth2/auth"
    token_uri: str = "https://oauth2.googleapis.com/token"
    auth_provider_x509_cert_url: str = "https://www.googleapis.com/oauth2/v1/certs"
    type: str = "service_account"
    client_x509_cert_url: Optional[str]
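
# A minimal sketch of the temporary _type_map patch described in the comment
# above. The real get_workunits body is not shown in this excerpt, so this
# subclass name and its super() call are assumptions about how the patch is
# applied.
from unittest.mock import patch


class _PatchedBigQuerySource(SQLAlchemySource):
    def get_workunits(self):
        # Map GEOGRAPHY columns only while work units are generated, leaving
        # pybigquery's global _type_map unchanged afterwards.
        with patch.dict(
            pybigquery.sqlalchemy_bigquery._type_map,
            {"GEOGRAPHY": GEOGRAPHY},
            clear=False,
        ):
            yield from super().get_workunits()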
    SqlWorkUnit,
    TimeTypeClass,
    register_custom_type,
)
from datahub.ingestion.source_config.sql.snowflake import SnowflakeConfig
from datahub.ingestion.source_report.sql.snowflake import SnowflakeReport
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineage,
)
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass

register_custom_type(custom_types.TIMESTAMP_TZ, TimeTypeClass)
register_custom_type(custom_types.TIMESTAMP_LTZ, TimeTypeClass)
register_custom_type(custom_types.TIMESTAMP_NTZ, TimeTypeClass)
register_custom_type(custom_types.VARIANT, RecordTypeClass)

logger: logging.Logger = logging.getLogger(__name__)

snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType


class SnowflakeSource(SQLAlchemySource):
    def __init__(self, config: SnowflakeConfig, ctx: PipelineContext):
        super().__init__(config, ctx, "snowflake")
        self._lineage_map: Optional[Dict[str, List[Tuple[str, str, str]]]] = None
        self._external_lineage_map: Optional[Dict[str, Set[str]]] = None
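
    # A hedged sketch (not the source's method) of how entries in
    # self._lineage_map could become the lineage aspects imported above; the
    # (upstream_urn, query, columns) tuple layout and the method name are
    # assumptions about code outside this excerpt.
    def _upstream_lineage_for(
        self, entries: List[Tuple[str, str, str]]
    ) -> UpstreamLineage:
        upstreams = [
            UpstreamClass(dataset=urn, type=DatasetLineageTypeClass.TRANSFORMED)
            for urn, _query, _columns in entries
        ]
        return UpstreamLineage(upstreams=upstreams)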
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.source.sql.sql_common import (
    BasicSQLAlchemyConfig,
    SQLAlchemySource,
    register_custom_type,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    ArrayTypeClass,
    BytesTypeClass,
    MapTypeClass,
)

register_custom_type(custom_types.ARRAY, ArrayTypeClass)
register_custom_type(custom_types.JSON, BytesTypeClass)
register_custom_type(custom_types.JSONB, BytesTypeClass)
register_custom_type(custom_types.HSTORE, MapTypeClass)


class PostgresConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = Field(default="postgresql+psycopg2", description="database scheme")
    schema_pattern = Field(default=AllowDenyPattern(deny=["information_schema"]))

    def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str:
        regular = f"{schema}.{table}"
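
# Usage sketch for the schema_pattern default above: AllowDenyPattern.allowed()
# rejects names matching a deny regex, so the Postgres system schema stays out
# of ingestion while ordinary schemas pass through.
pattern = AllowDenyPattern(deny=["information_schema"])
assert pattern.allowed("public")
assert not pattern.allowed("information_schema")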
        duration = relativedelta(hours=1)
        if not partition_datetime:
            partition_datetime = datetime.datetime.strptime(partition_id, "%Y%m%d%H")
    else:
        raise ValueError(
            f"check your partition_id {partition_id}. It must be yearly/monthly/daily/hourly."
        )
    upper_bound_partition_datetime = partition_datetime + duration
    return partition_datetime, upper_bound_partition_datetime


# Handle the GEOGRAPHY type. We will temporarily patch the _type_map
# in the get_workunits method of the source.
GEOGRAPHY = make_sqlalchemy_type("GEOGRAPHY")
register_custom_type(GEOGRAPHY)
assert sqlalchemy_bigquery._types._type_map

# STRUCT is a custom sqlalchemy data type defined by the sqlalchemy_bigquery library
# https://github.com/googleapis/python-bigquery-sqlalchemy/blob/934e25f705fd9f226e438d075c7e00e495cce04e/sqlalchemy_bigquery/_types.py#L47
register_custom_type(sqlalchemy_bigquery.STRUCT, output=RecordTypeClass)


@dataclass
class BigQueryPartitionColumn:
    table_catalog: str
    table_schema: str
    table_name: str
    column_name: str
    data_type: str
    partition_id: str
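
# Worked example of the hourly branch above: partition id "2022010103" parses
# with "%Y%m%d%H", and relativedelta(hours=1) gives the exclusive upper bound,
# i.e. the range [2022-01-01 03:00, 2022-01-01 04:00).
example_start = datetime.datetime.strptime("2022010103", "%Y%m%d%H")
example_end = example_start + relativedelta(hours=1)
assert example_end == datetime.datetime(2022, 1, 1, 4, 0)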
    MapTypeClass,
    NumberTypeClass,
    RecordTypeClass,
    SchemaField,
)

if sys.version_info >= (3, 7):  # noqa: C901
    # This import verifies that the dependencies are available.
    import sqlalchemy_trino  # noqa: F401
    from sqlalchemy import exc, sql
    from sqlalchemy.engine import reflection
    from sqlalchemy.sql import sqltypes
    from sqlalchemy_trino import datatype, error
    from sqlalchemy_trino.dialect import TrinoDialect

    register_custom_type(datatype.ROW, RecordTypeClass)
    register_custom_type(datatype.MAP, MapTypeClass)
    register_custom_type(datatype.DOUBLE, NumberTypeClass)

    # Read only table names and skip view names, as view names will also be
    # returned from get_view_names.
    @reflection.cache  # type: ignore
    def get_table_names(self, connection, schema: str = None, **kw):  # type: ignore
        schema = schema or self._get_default_schema_name(connection)
        if schema is None:
            raise exc.NoSuchTableError("schema is required")
        query = dedent(
            """
            SELECT "table_name"
            FROM "information_schema"."tables"
            WHERE "table_schema" = :schema and "table_type" != 'VIEW'
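
    # A hedged sketch of how a reflection override like get_table_names above
    # is typically installed: assigned onto the imported TrinoDialect so that
    # SQLAlchemy inspection picks it up. That this module wires it exactly this
    # way is an assumption based on the TrinoDialect import.
    TrinoDialect.get_table_names = get_table_names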
base.ischema_names["DateTime64(3)"] = DATETIME base.ischema_names["DateTime64(4)"] = DATETIME base.ischema_names["DateTime64(5)"] = DATETIME base.ischema_names["DateTime64(6)"] = DATETIME base.ischema_names["DateTime64(7)"] = DATETIME base.ischema_names["DateTime64(8)"] = DATETIME base.ischema_names["DateTime64(9)"] = DATETIME base.ischema_names["Date32"] = DATE base.ischema_names["Bool"] = BOOLEAN base.ischema_names["Nothing"] = sqltypes.NullType base.ischema_names["Int128"] = INTEGER base.ischema_names["Int256"] = INTEGER base.ischema_names["UInt128"] = INTEGER base.ischema_names["UInt256"] = INTEGER register_custom_type(custom_types.common.Array, ArrayTypeClass) register_custom_type(custom_types.ip.IPv4, NumberTypeClass) register_custom_type(custom_types.ip.IPv6, StringTypeClass) register_custom_type(custom_types.common.Map, MapTypeClass) register_custom_type(custom_types.common.Tuple, UnionTypeClass) class LineageCollectorType(Enum): TABLE = "table" VIEW = "view" MATERIALIZED_VIEW = "materialized_view" class LineageDatasetPlatform(Enum): CLICKHOUSE = "clickhouse"
import pymysql  # noqa: F401
from sqlalchemy.dialects.mysql import base

from datahub.ingestion.source.sql.sql_common import (
    BasicSQLAlchemyConfig,
    SQLAlchemySource,
    make_sqlalchemy_type,
    register_custom_type,
)

GEOMETRY = make_sqlalchemy_type("GEOMETRY")
POINT = make_sqlalchemy_type("POINT")
LINESTRING = make_sqlalchemy_type("LINESTRING")
POLYGON = make_sqlalchemy_type("POLYGON")

register_custom_type(GEOMETRY)
register_custom_type(POINT)
register_custom_type(LINESTRING)
register_custom_type(POLYGON)

base.ischema_names["geometry"] = GEOMETRY
base.ischema_names["point"] = POINT
base.ischema_names["linestring"] = LINESTRING
base.ischema_names["polygon"] = POLYGON


class MySQLConfig(BasicSQLAlchemyConfig):
    # defaults
    host_port = "localhost:3306"
    scheme = "mysql+pymysql"
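
# Usage sketch (hypothetical credentials): MySQLConfig inherits the standard
# connection fields from BasicSQLAlchemyConfig, so a recipe only needs to
# override what differs from the defaults declared above.
mysql_config = MySQLConfig.parse_obj({"username": "reader", "password": "example"})
assert mysql_config.host_port == "localhost:3306"
assert mysql_config.scheme == "mysql+pymysql"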