def test_get_column_type_contains_key(self):
    """A 'char' field type maps to StringTypeClass."""
    resolved = get_column_type(self.glue_source, "char", "a_table", "a_field")
    expected = SchemaFieldDataType(type=StringTypeClass())
    self.assertEqual(resolved.to_obj(), expected.to_obj())
def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
    """Infer a flat, dot-delimited schema from the JSON content of `file`."""
    records = ujson.load(file)
    # A single top-level object is treated as a one-record list.
    if not isinstance(records, list):
        records = [records]
    inferred = construct_schema(records, delimiter=".")

    fields: List[SchemaField] = []
    for entry in sorted(inferred.values(), key=lambda e: e["delimited_name"]):
        mapped = _field_type_mapping.get(entry["type"], NullTypeClass)
        native = entry["type"]
        # Python classes would stringify as "<class 'int'>"; use the bare name.
        if isinstance(native, type):
            native = native.__name__
        fields.append(
            SchemaField(
                fieldPath=entry["delimited_name"],
                nativeDataType=native,
                type=SchemaFieldDataType(type=mapped()),
                nullable=entry["nullable"],
                recursive=False,
            )
        )
    return fields
def get_schema_metadata_for_custom_sql(
    self, columns: List[dict]
) -> Optional[SchemaMetadata]:
    """Build SchemaMetadata for a custom-SQL datasource's columns.

    Returns None when `columns` is empty.

    Bug fix: `fields` was re-initialized inside the loop and the
    SchemaMetadata was rebuilt per iteration, so the returned schema
    contained only the LAST column. Fields are now accumulated across
    all columns and the metadata is constructed once.
    """
    fields = []
    for field in columns:
        nativeDataType = field.get("remoteType", "UNKNOWN")
        TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
        schema_field = SchemaField(
            fieldPath=field.get("name", ""),
            type=SchemaFieldDataType(type=TypeClass()),
            nativeDataType=nativeDataType,
            description=field.get("description", ""),
        )
        fields.append(schema_field)

    schema_metadata = None
    if fields:
        schema_metadata = SchemaMetadata(
            schemaName="test",
            platform=f"urn:li:dataPlatform:{self.platform}",
            version=0,
            fields=fields,
            hash="",
            platformSchema=OtherSchema(rawSchema=""),
        )
    return schema_metadata
def test_get_column_type_contains_map():
    """Field types starting with 'map' resolve to MapTypeClass."""
    resolved = get_column_type(glue_source(), "map_hehe", "a_table", "a_field")
    expected = SchemaFieldDataType(type=MapTypeClass())
    assert resolved.to_obj() == expected.to_obj()
def test_get_column_type_contains_set():
    """Field types starting with 'set' resolve to ArrayTypeClass."""
    resolved = get_column_type(glue_source(), "set_yolo", "a_table", "a_field")
    expected = SchemaFieldDataType(type=ArrayTypeClass())
    assert resolved.to_obj() == expected.to_obj()
def test_get_column_type_contains_map(self):
    """Field types starting with 'map' resolve to MapTypeClass."""
    resolved = get_column_type(self.glue_source, "map_hehe", "a_table", "a_field")
    expected = SchemaFieldDataType(type=MapTypeClass())
    self.assertEqual(resolved.to_obj(), expected.to_obj())
def test_get_column_type_contains_key():
    """A 'char' field type maps to StringTypeClass."""
    resolved = get_column_type(glue_source(), "char", "a_table", "a_field")
    expected = SchemaFieldDataType(type=StringTypeClass())
    assert resolved.to_obj() == expected.to_obj()
def test_get_column_type_contains_set(self):
    """Field types starting with 'set' resolve to ArrayTypeClass."""
    resolved = get_column_type(self.glue_source, "set_yolo", "a_table", "a_field")
    expected = SchemaFieldDataType(type=ArrayTypeClass())
    self.assertEqual(resolved.to_obj(), expected.to_obj())
def get_column_type(
    report: SourceReport, dataset_name: str, column_type: str
) -> SchemaFieldDataType:
    """
    Maps known DBT types to datahub types.

    :param report: report to record a warning on for unmappable types
    :param dataset_name: dataset the column belongs to (used in the warning)
    :param column_type: raw DBT type string, e.g. "varchar(256)"
    :return: the mapped SchemaFieldDataType (NullTypeClass when unknown)
    """
    # Strip trailing modifiers by keeping only the leading run of word
    # characters and spaces, e.g. "varchar(256)" -> "varchar".
    column_type_stripped = ""
    match = re.match(r"[\w ]+", column_type)
    if match is not None:
        column_type_stripped = match.group()

    # Direct O(1) dict lookup instead of the previous linear scan over keys.
    TypeClass: Any = _field_type_mapping.get(column_type_stripped)
    if TypeClass is None:
        report.report_warning(
            dataset_name, f"unable to map type {column_type} to metadata schema"
        )
        TypeClass = NullTypeClass
    return SchemaFieldDataType(type=TypeClass())
def get_column_type(
    sql_report: SQLSourceReport, dataset_name: str, column_type: Any
) -> SchemaFieldDataType:
    """
    Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html)
    to corresponding schema types.
    """
    # First matching SQLAlchemy base class wins.
    TypeClass: Optional[Type] = next(
        (
            mapped
            for sql_type, mapped in _field_type_mapping.items()
            if isinstance(column_type, sql_type)
        ),
        None,
    )
    if TypeClass is None:
        # Known-but-unmappable types fall through to Null without a warning.
        if any(
            isinstance(column_type, sql_type)
            for sql_type in _known_unknown_field_types
        ):
            TypeClass = NullTypeClass
        else:
            sql_report.report_warning(
                dataset_name, f"unable to map type {column_type!r} to metadata schema"
            )
            TypeClass = NullTypeClass
    return SchemaFieldDataType(type=TypeClass())
def _get_column_type(self, field_type: Union[str, dict]) -> SchemaFieldDataType:
    """Resolve a raw field type to a SchemaFieldDataType via the instance mapping."""
    raw = field_type
    # Unwrap schema objects that expose their type via a `.type` attribute.
    if hasattr(raw, "type"):
        raw = raw.type  # type: ignore
    TypeClass: Any = self.field_type_mapping.get(str(raw))
    # NOTE(review): an unmapped type leaves TypeClass as None and the call
    # below raises TypeError — presumably all inputs are mapped; confirm.
    return SchemaFieldDataType(type=TypeClass())
def create_metadata_work_unit(timestamp):
    """Build the expected Glue MCE work unit for the 'Barbeque' table fixture."""
    ownership = OwnershipClass(
        owners=[
            OwnerClass(
                owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
            )
        ],
        lastModified=AuditStampClass(
            time=timestamp, actor="urn:li:corpuser:datahub"
        ),
    )
    properties = DatasetPropertiesClass(
        description="Grilled Food",
        customProperties={},
        uri=None,
        tags=[],
    )
    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[
            SchemaField(
                fieldPath="Size",
                nativeDataType="int",
                type=SchemaFieldDataType(type=NumberTypeClass()),
                description="Maximum attendees permitted",
                nullable=True,
                recursive=False,
            )
        ],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    # Aspect order matters for snapshot comparison: status, ownership,
    # properties, then schema.
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[Status(removed=False), ownership, properties, schema_metadata],
    )
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def get_column_type(
    glue_source: GlueSource, field_type: str, table_name: str, field_name: str
) -> SchemaFieldDataType:
    """Map a Glue field type string to a SchemaFieldDataType.

    Exact type names are tried first, then known prefixes (e.g. "map<...>");
    unrecognised types fall back to StringTypeClass with a warning.
    """
    exact_type_map = {
        "array": ArrayTypeClass,
        "bigint": NumberTypeClass,
        "binary": BytesTypeClass,
        "boolean": BooleanTypeClass,
        "char": StringTypeClass,
        "date": DateTypeClass,
        "decimal": NumberTypeClass,
        "double": NumberTypeClass,
        "float": NumberTypeClass,
        "int": NumberTypeClass,
        "integer": NumberTypeClass,
        "interval": TimeTypeClass,
        "long": NumberTypeClass,
        "map": MapTypeClass,
        "null": NullTypeClass,
        "set": ArrayTypeClass,
        "smallint": NumberTypeClass,
        "string": StringTypeClass,
        "struct": MapTypeClass,
        "timestamp": TimeTypeClass,
        "tinyint": NumberTypeClass,
        "union": UnionTypeClass,
        "varchar": StringTypeClass,
    }
    prefix_type_map = {
        "array": ArrayTypeClass,
        "set": ArrayTypeClass,
        "map": MapTypeClass,
        "struct": MapTypeClass,
        "varchar": StringTypeClass,
        "decimal": NumberTypeClass,
    }

    type_class = exact_type_map.get(field_type)
    if type_class is None:
        for prefix, candidate in prefix_type_map.items():
            if field_type.startswith(prefix):
                type_class = candidate
                break
    if type_class is None:
        glue_source.report.report_warning(
            field_type,
            f"The type '{field_type}' is not recognised for field '{field_name}' in table '{table_name}', setting as StringTypeClass.",
        )
        type_class = StringTypeClass

    return SchemaFieldDataType(type=type_class())
def _get_column_type(field_type) -> SchemaFieldDataType:
    """Resolve a raw field type to a SchemaFieldDataType via the module mapping."""
    raw = field_type
    # Unwrap schema objects that expose their type via a `.type` attribute.
    if hasattr(raw, "type"):
        raw = raw.type
    TypeClass: Any = _field_type_mapping.get(str(raw))
    # Note: we could populate the nestedTypes field for unions and similar fields
    # for the other types as well. However, since we already populate the
    # nativeDataType field below, it is mostly ok to leave this as not fully
    # initialized.
    return SchemaFieldDataType(type=TypeClass())
def test_get_column_type_not_contained():
    """An unknown type falls back to StringTypeClass and records a warning."""
    source = glue_source()
    resolved = get_column_type(source, "bad_column_type", "a_table", "a_field")
    assert resolved.to_obj() == SchemaFieldDataType(type=StringTypeClass()).to_obj()
    assert source.report.warnings["bad_column_type"] == [
        "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
        "setting as StringTypeClass."
    ]
def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
    """Translate an Elasticsearch column type into a SchemaFieldDataType."""
    converter_map = ElasticToSchemaFieldConverter._field_type_to_schema_field_type
    type_class: Optional[Type] = converter_map.get(elastic_column_type)
    if type_class is None:
        logger.warning(
            f"Cannot map {elastic_column_type!r} to SchemaFieldDataType, using NullTypeClass."
        )
        type_class = NullTypeClass
    return SchemaFieldDataType(type=type_class())
def _get_column_type(
    self, field_type: Union[str, dict], logical_type: str
) -> SchemaFieldDataType:
    """Resolve a field type, letting any logical type override the physical one."""
    raw = field_type
    # Unwrap schema objects that expose their type via a `.type` attribute.
    if hasattr(raw, "type"):
        raw = raw.type  # type: ignore
    TypeClass: Any = self.field_type_mapping.get(str(raw))
    # A logical type (e.g. a date stored as an int) takes precedence when mapped.
    if logical_type is not None:
        TypeClass = self.field_logical_type_mapping.get(logical_type, TypeClass)
    return SchemaFieldDataType(type=TypeClass())
def test_get_column_type_not_contained(self):
    """An unknown type falls back to StringTypeClass and records a warning."""
    resolved = get_column_type(self.glue_source, "bad_column_type", "a_table", "a_field")
    expected = SchemaFieldDataType(type=StringTypeClass())
    self.assertEqual(resolved.to_obj(), expected.to_obj())
    self.assertEqual(
        self.glue_source.report.warnings["bad_column_type"],
        [
            "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
            "setting as StringTypeClass."
        ],
    )
def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
    """Yield one MCE work unit per upstream table collected during ingestion."""
    for table_urn, (columns, path, is_embedded) in self.upstream_tables.items():
        # External (non-embedded) tables are skipped unless explicitly enabled.
        if not is_embedded and not self.config.ingest_tables_external:
            logger.error(
                f"Skipping external table {table_urn} as ingest_tables_external is set to False"
            )
            continue

        snapshot = DatasetSnapshot(
            urn=table_urn,
            aspects=[],
        )

        if path:
            # Browse path
            snapshot.aspects.append(
                BrowsePathsClass(
                    paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"]
                )
            )
        else:
            logger.debug(f"Browse path not set for table {table_urn}")

        if columns:
            schema_fields = []
            for column in columns:
                native_type = column.get("remoteType", "UNKNOWN")
                type_class = FIELD_TYPE_MAPPING.get(native_type, NullTypeClass)
                schema_fields.append(
                    SchemaField(
                        fieldPath=column["name"],
                        type=SchemaFieldDataType(type=type_class()),
                        description="",
                        nativeDataType=native_type,
                    )
                )
            snapshot.aspects.append(
                SchemaMetadata(
                    schemaName="test",
                    platform=f"urn:li:dataPlatform:{self.platform}",
                    version=0,
                    fields=schema_fields,
                    hash="",
                    platformSchema=OtherSchema(rawSchema=""),
                )
            )

        yield self.get_metadata_change_event(snapshot)
def get_column_type(data_type: str) -> SchemaFieldDataType:
    """Classify a raw catalog type string by regex and wrap it in a SchemaFieldDataType."""
    # Ordered (pattern, type) pairs; the first matching pattern wins.
    pattern_checks = (
        (CatalogSource.int_pattern, NumberTypeClass),
        (CatalogSource.text_pattern, StringTypeClass),
        (CatalogSource.byte_pattern, BytesTypeClass),
        (CatalogSource.date_pattern, DateTypeClass),
        (CatalogSource.time_pattern, TimeTypeClass),
        (CatalogSource.timestamp_pattern, TimeTypeClass),
    )
    type_class: Type = NullTypeClass
    for pattern, candidate in pattern_checks:
        if pattern.match(data_type) is not None:
            type_class = candidate
            break
    return SchemaFieldDataType(type=type_class())
def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]:
    """Infer schema fields for `table`, sampling at most `max_rows` rows."""
    table.infer(limit=max_rows)
    return [
        SchemaField(
            fieldPath=raw.name,
            type=SchemaFieldDataType(
                tableschema_type_map.get(raw.type, NullTypeClass)()
            ),
            nativeDataType=str(raw.type),
            recursive=False,
        )
        for raw in table.schema.fields
    ]
def _get_field_type(self, native_type: str) -> SchemaFieldDataType:
    """Map a Looker native field type to a SchemaFieldDataType, warning on unknowns."""
    native_to_type_class = {
        "date": DateTypeClass,
        "date_time": TimeTypeClass,
        "distance": NumberTypeClass,
        "duration": NumberTypeClass,
        "location": UnionTypeClass,
        "number": NumberTypeClass,
        "string": StringTypeClass,
        "tier": EnumTypeClass,
        "time": TimeTypeClass,
        "unquoted": StringTypeClass,
        "yesno": BooleanTypeClass,
        "zipcode": EnumTypeClass,
        "int": NumberTypeClass,
        "average": NumberTypeClass,
        "average_distinct": NumberTypeClass,
        "count": NumberTypeClass,
        "count_distinct": NumberTypeClass,
        "list": ArrayTypeClass,
        "max": NumberTypeClass,
        "median": NumberTypeClass,
        "median_distinct": NumberTypeClass,
        "min": NumberTypeClass,
        "percent_of_previous": NumberTypeClass,
        "percent_of_total": NumberTypeClass,
        "percentile": NumberTypeClass,
        "percentile_distinct": NumberTypeClass,
        "running_total": NumberTypeClass,
        "sum": NumberTypeClass,
        "sum_distinct": NumberTypeClass,
    }

    type_class = native_to_type_class.get(native_type)
    if type_class is None:
        self.reporter.report_warning(
            native_type,
            f"The type '{native_type}' is not recognised for field type, setting as NullTypeClass.",
        )
        type_class = NullTypeClass

    return SchemaFieldDataType(type=type_class())
def _get_field_type(self, native_type: str) -> SchemaFieldDataType:
    """Resolve a native column type, falling back to Postgres modified types."""
    type_class = field_type_mapping.get(native_type)
    if type_class is None:
        # Second chance: Postgres-style modified types, e.g. "character varying(20)".
        type_class = resolve_postgres_modified_type(native_type)
    if type_class is None:
        # Still unknown: warn and degrade to NullTypeClass.
        self.reporter.report_warning(
            native_type,
            f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.",
        )
        type_class = NullTypeClass
    return SchemaFieldDataType(type=type_class())
def _get_field_type(native_type: str, reporter: SourceReport) -> SchemaFieldDataType:
    """Resolve a native column type, falling back to Postgres modified types."""
    type_class = LookerUtil.field_type_mapping.get(native_type)
    if type_class is None:
        # Second chance: Postgres-style modified types, e.g. "character varying(20)".
        type_class = resolve_postgres_modified_type(native_type)
    if type_class is None:
        # Still unknown: log (no warning on the report) and degrade to Null.
        logger.info(
            f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.",
        )
        type_class = NullTypeClass
    return SchemaFieldDataType(type=type_class())
def _get_column_type(descriptor: DescriptorBase) -> SchemaFieldDataType:
    """Convert a protobuf field descriptor into a SchemaFieldDataType."""
    native_type: str = _get_simple_native_type(descriptor)
    type_instance: Any
    if getattr(descriptor, "label", None) == FieldDescriptor.LABEL_REPEATED:
        # Repeated fields become arrays of the element's native type.
        type_instance = ArrayTypeClass(nestedType=[native_type])
    elif getattr(descriptor, "type", None) == FieldDescriptor.TYPE_ENUM:
        type_instance = EnumTypeClass()
    # TODO: Find a better way to detect maps
    #
    # elif simple_type == "map":
    #     type_instance = MapTypeClass(
    #         keyType=descriptor.key_type,
    #         valueType=descriptor.val_type,
    #     )
    else:
        type_instance = _native_type_to_typeclass.get(native_type, RecordTypeClass)()
    return SchemaFieldDataType(type=type_instance)
def get_column_type(
    report: SourceReport, dataset_name: str, column_type: str
) -> SchemaFieldDataType:
    """
    Maps known DBT types to datahub types
    """
    type_class: Any = _field_type_mapping.get(column_type)
    if type_class is None:
        # Second chance: Postgres-style modified types, e.g. "character varying(20)".
        type_class = resolve_postgres_modified_type(column_type)
    if type_class is None:
        # Still unknown: warn on the report and degrade to NullTypeClass.
        report.report_warning(
            dataset_name, f"unable to map type {column_type} to metadata schema"
        )
        type_class = NullTypeClass
    return SchemaFieldDataType(type=type_class())
def _get_schema_metadata_for_datasource(
    self, datasource_fields: List[dict]
) -> Optional[SchemaMetadata]:
    """Build SchemaMetadata from Tableau datasource fields; None when empty."""
    schema_fields = []
    for field in datasource_fields:
        # check datasource - custom sql relations from a field being referenced
        self._track_custom_sql_ids(field)

        native_type = field.get("dataType", "UNKNOWN")
        type_class = FIELD_TYPE_MAPPING.get(native_type, NullTypeClass)

        tags = None
        if self.config.ingest_tags:
            tags = get_tags_from_params(
                [
                    field.get("role", ""),
                    field.get("__typename", ""),
                    field.get("aggregation", ""),
                ]
            )

        schema_fields.append(
            SchemaField(
                fieldPath=field["name"],
                type=SchemaFieldDataType(type=type_class()),
                description=make_description_from_params(
                    field.get("description", ""), field.get("formula")
                ),
                nativeDataType=native_type,
                globalTags=tags,
            )
        )

    if not schema_fields:
        return None
    return SchemaMetadata(
        schemaName="test",
        platform=f"urn:li:dataPlatform:{self.platform}",
        version=0,
        fields=schema_fields,
        hash="",
        platformSchema=OtherSchema(rawSchema=""),
    )
def get_column_type(
    report: SourceReport, dataset_name: str, column_type: Any
) -> SchemaFieldDataType:
    """
    Maps known Spark types to datahub types.

    Fix: `column_type` was annotated `str`, but the body checks it with
    `isinstance(column_type, field_type)` against Spark type classes — it is
    a type *instance*, not a string. Annotated as Any, matching the
    SQLAlchemy variant of this function.
    """
    TypeClass: Any = None
    for field_type, type_class in _field_type_mapping.items():
        if isinstance(column_type, field_type):
            TypeClass = type_class
            break
    # if still not found, report the warning
    if TypeClass is None:
        report.report_warning(
            dataset_name, f"unable to map type {column_type} to metadata schema"
        )
        TypeClass = NullTypeClass
    return SchemaFieldDataType(type=TypeClass())
def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
    """Infer fields from a Parquet file's footer without reading the data pages."""
    # read_schema only touches the footer; memory_map avoids a full load.
    parquet_schema = pyarrow.parquet.read_schema(file, memory_map=True)
    return [
        SchemaField(
            fieldPath=column_name,
            type=SchemaFieldDataType(map_pyarrow_type(arrow_type)()),
            nativeDataType=str(arrow_type),
            recursive=False,
        )
        for column_name, arrow_type in zip(parquet_schema.names, parquet_schema.types)
    ]
def get_field_type(
    self, field_type: Union[Type, str], collection_name: str
) -> SchemaFieldDataType:
    """
    Maps types encountered in PyMongo to corresponding schema types.

    Parameters
    ----------
    field_type:
        type of a Python object
    collection_name:
        name of collection (for logging)
    """
    type_class: Optional[Type] = _field_type_mapping.get(field_type)
    if type_class is None:
        self.report.report_warning(
            collection_name, f"unable to map type {field_type} to metadata schema"
        )
        type_class = NullTypeClass
    return SchemaFieldDataType(type=type_class())