def run(self) -> List[SchemaContent]:
    db_list = self.inspection.get_schema_names()
    update_tables = self.get_update_tables(db_list)
    schemas = []
    for db, tb in update_tables:
        schema = SchemaContent(name=tb, database=db,
                               comment=self.get_table_comment(tb, db),
                               type=self.db_type)
        columns = self.inspection.get_columns(table_name=self._warp(tb),
                                              schema=self._warp(db))
        self.set_primary_info(schema, columns, db, tb)
        fields = []
        for x in columns:
            field = SchemaField(name=x['name'],
                                type=self.convert_flink_type(x['type']),
                                nullable=x['nullable'],
                                autoincrement=x.get('autoincrement'))
            if field.type is None:
                # columns whose type cannot be mapped to a Flink type are skipped
                logger.error("Column {} in {}.{} skipped: unsupported type {}".format(
                    field.name, schema.database, schema.name, str(x['type'])))
            else:
                fields.append(field)
        schema.fields.extend(fields)
        schemas.append(schema)
    return schemas
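For orientation, this is a minimal sketch of the objects the collector assembles; the table, database and column names are made up, and the types reuse constants that appear elsewhere in this code:

# Illustrative only: hypothetical output of one loop iteration above.
schema = SchemaContent(name='orders', database='shop',
                       comment='order table', type='mysql')
schema.fields.extend([
    SchemaField(name='id', type=BlinkSQLType.INTEGER, nullable=False, autoincrement=True),
    SchemaField(name='create_at', type=BlinkSQLType.TIMESTAMP, nullable=True, autoincrement=None),
])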
def run(self) -> List[SchemaContent]:
    assert 'elasticsearch' == self.db_type
    from elasticsearch import Elasticsearch
    es = Elasticsearch(hosts=self.connection_url)
    schemas = []
    for index in es.indices.get_alias('*'):
        if index not in self.need_tables:
            continue
        schema = SchemaContent(name=index, type=self.db_type)
        mappings = es.indices.get_mapping(index)
        fields = []
        for k, v in mappings[index]['mappings']['properties'].items():
            field = SchemaField(name=k, type=self._es2flink(v['type']), nullable=True)
            if field.type is None:
                # mapping types without a Flink equivalent are skipped
                logger.error("Column {} in {}.{} skipped: unsupported type {}".format(
                    field.name, schema.database, schema.name, str(v['type'])))
            else:
                fields.append(field)
        fields.sort(key=lambda x: x.name)
        schema.fields.extend(fields)
        logger.debug(schema)
        schemas.append(schema)
    return schemas
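As a rough illustration of the structure the inner loop walks (index and property names are invented; the actual Flink types come from the project's own _es2flink mapping):

# Hypothetical shape of es.indices.get_mapping(index) for a typeless (ES 7.x) index.
mappings = {
    "logs": {                         # index name
        "mappings": {
            "properties": {
                "message": {"type": "text"},
                "ts": {"type": "date"},
            }
        }
    }
}
# Each property name becomes a SchemaField; its ES type is translated by self._es2flink,
# and unsupported types are logged and dropped.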
def to_schema_content(self) -> SchemaContent:
    origin_dict = self.as_dict()
    use = ['name', 'database', 'comment', 'partitionable']
    origin = {k: v for k, v in origin_dict.items() if k in use}
    origin['fields'] = [SchemaField(**x) for x in load_yaml(self.fields)] if self.fields else []
    origin['type'] = self.connection.type.code
    return SchemaContent(**origin)
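A small sketch of the fields round trip, assuming self.fields holds a YAML list of keyword dicts (the field values below are illustrative, not taken from the source):

# Hypothetical content of self.fields as stored on the model.
fields_yaml = """
- name: id
  type: INTEGER
  nullable: false
- name: create_at
  type: TIMESTAMP
  nullable: true
"""
# load_yaml parses the text into a list of dicts, each expanded into a SchemaField.
fields = [SchemaField(**x) for x in load_yaml(fields_yaml)]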
def rowtime_field(self) -> Optional[SchemaField]:
    if self.get_config('rowtime_enable', typ=bool):
        row_field = {
            "timestamps": {
                "type": "from-field",
                "from": self.get_config('rowtime_from', typ=str)
            },
            "watermarks": {
                "type": "periodic-bounded",
                "delay": self.get_config("rowtime_watermarks", typ=int)
            }
        }
        return SchemaField(name=self.get_config('rowtime_name', typ=str),
                           type=BlinkSQLType.TIMESTAMP,
                           rowtime=row_field)
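For illustration, with hypothetical config values rowtime_name='rowtime', rowtime_from='create_at' and rowtime_watermarks=5000, the method would return roughly:

# Illustrative result only; the config values above are invented.
SchemaField(name='rowtime', type=BlinkSQLType.TIMESTAMP,
            rowtime={
                "timestamps": {"type": "from-field", "from": "create_at"},
                "watermarks": {"type": "periodic-bounded", "delay": 5000},
            })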
def generate_table_cache(self, config: VersionConfig, connection: Connection,
                         connection_type: str, resource_name: ResourceName,
                         schema: SchemaEvent, template: ResourceTemplate,
                         template_type: str, version: 'ResourceVersion') -> dict:
    res = dict()
    if config.update_mode:
        res['update-mode'] = config.update_mode
    # format: explicit config wins; otherwise derive it from the resource save format
    if config.format:
        res['format'] = config.format
    elif connection_type not in FlinkConnectorType.schema_less():
        if resource_name.save_format == FlinkSaveFormat.json:
            res['format'] = {"type": 'json'}
        elif resource_name.save_format == FlinkSaveFormat.csv:
            res['format'] = {"type": 'csv'}
        else:
            msg = "Unsupported save format: {} in {}".format(resource_name.save_format,
                                                             resource_name.full_name)
            raise NotImplementedError(msg)
    connector = self.generate_table_connector(connection, connection_type,
                                              resource_name, schema, template, version)
    res['connector'] = connector if connector else None
    fields = [SchemaField(**x) for x in load_yaml(schema.fields)] if schema.fields else []
    need_fields = [x for x in fields if x.name in NameFilter(config.include, config.exclude)]
    field_names = [x.name for x in need_fields]
    schemas = []
    if config.schema:
        for x in config.schema:
            assert x['name'] not in field_names, "{} is already contained in the origin fields".format(x['name'])
        schemas.extend(config.schema)
    for x in need_fields:
        if self.is_filter_field(x, connection_type, template_type, resource_name):
            continue
        schemas.append(self.field2schema(x))
    res['schema'] = schemas
    return res
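A hedged sketch of the dict this method returns; the connector block and the per-field schema entries are produced by generate_table_connector and field2schema respectively, so they are shown only as placeholders here:

# Illustrative output shape; 'append' and 'json' are example values, not defaults.
{
    'update-mode': 'append',
    'format': {'type': 'json'},
    'connector': {...},    # built by generate_table_connector
    'schema': [...],       # config.schema entries followed by field2schema(x) results
}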
def _generate_update_fields(self, schema_event: SchemaEvent,
                            config: VersionConfig) -> List[SchemaField]:
    cnt = self._connector
    fields = []
    config.exclude = '.*'
    schema_fields = [SchemaField(**x) for x in load_yaml(schema_event.fields)] if schema_event else []
    for schema in schema_fields:
        # every source field expands into a before / after / is-update triple
        for suffix, tp in zip(
                [cnt.before_column_suffix, cnt.after_column_suffix, cnt.update_suffix],
                [schema.type, schema.type, BlinkSQLType.BOOLEAN]):
            n_schema = deepcopy(schema)
            n_schema.type = tp
            n_schema.name = n_schema.name + suffix
            fields.append(n_schema)
    return fields
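As an example, assuming the connector suffixes are '_before', '_after' and '_is_update' (the real values come from self._connector), a single INTEGER field named 'amount' would expand into:

# Hypothetical suffixes; only the expansion pattern is taken from the method above.
[
    SchemaField(name='amount_before', type=BlinkSQLType.INTEGER),
    SchemaField(name='amount_after', type=BlinkSQLType.INTEGER),
    SchemaField(name='amount_is_update', type=BlinkSQLType.BOOLEAN),
]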
def run(self) -> List[SchemaContent]:
    assert 'hbase' == self.db_type
    import happybase
    host, port = self.connection_url.split(':')
    connection = happybase.Connection(host, int(port), autoconnect=True)
    schemas = []
    for x in connection.tables():
        tab = x.decode()
        table = connection.table(tab)
        schema = SchemaContent(name=tab, type=self.db_type)
        fields = []
        for fm in table.families():
            # each HBase column family is exposed as a single BYTES field
            fields.append(SchemaField(name=fm.decode(),
                                      type=BlinkHiveSQLType.BYTES,
                                      nullable=True))
        schema.fields.extend(fields)
        schemas.append(schema)
    return schemas
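Here connection_url is expected to be a 'host:port' pair. As an illustration, a hypothetical table 'user' with column families 'cf' and 'meta' would roughly produce:

# Illustrative only; table and family names are invented.
schema = SchemaContent(name='user', type='hbase')
schema.fields.extend([
    SchemaField(name='cf', type=BlinkHiveSQLType.BYTES, nullable=True),
    SchemaField(name='meta', type=BlinkHiveSQLType.BYTES, nullable=True),
])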
def db_execute_time_field(self) -> Optional[SchemaField]:
    if not self.get_config('rowtime_enable', typ=bool):
        return SchemaField(name=self.get_config('rowtime_from', typ=str),
                           type=BlinkSQLType.TIMESTAMP)
def binlog_type_name_field(self) -> SchemaField:
    return SchemaField(name=self.get_config('binlog_type_name', typ=str),
                       type=BlinkSQLType.INTEGER)
def process_time_field(self) -> Optional[SchemaField]:
    if self.get_config('process_time_enable', typ=bool):
        return SchemaField(name=self.get_config('process_time_name', typ=str),
                           type=BlinkSQLType.TIMESTAMP,
                           proctime=True)
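The field helpers above all read their behaviour from connector configuration; a hedged example of the keys they consult (key names are taken from the get_config calls, values are illustrative):

# Illustrative configuration only.
config = {
    'rowtime_enable': True,
    'rowtime_name': 'rowtime',
    'rowtime_from': 'create_at',
    'rowtime_watermarks': 5000,
    'process_time_enable': True,
    'process_time_name': 'proctime',
    'binlog_type_name': 'binlog_type',
}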