def _add_table_index(self, entity: Type[ffd.Entity], field_):
    ff.retry(lambda: self._exec(
        f"alter table {self._fqtn(entity)} add column `{field_.name}` {self._db_type(field_)}", []))
    ff.retry(lambda: self._exec(
        f"create index `idx_{field_.name}` on {self._fqtn(entity)} (`{field_.name}`)", []))
def _drop_table_index(self, entity: Type[ffd.Entity], name: str):
    index = f'`idx_{name}`'
    ff.retry(lambda: self._exec(f"drop index {index} on {self._fqtn(entity)}", []))
    column = f'`{name}`'
    ff.retry(lambda: self._exec(f"alter table {self._fqtn(entity)} drop column {column}", []))
def _remove(self, entity: ff.Entity):
    sql = f"delete from {self._fqtn(entity.__class__)} where id = :id"
    params = [
        {'name': 'id', 'value': {'stringValue': entity.id_value()}},
    ]
    # Wrap the call in a lambda so ff.retry can re-invoke it; passing the result of
    # self._exec() directly would execute the statement before retry ever saw it.
    ff.retry(lambda: self._exec(sql, params))
def _paginate(self, sql: str, params: list, entity: Type[ff.Entity]):
    if entity.__name__ not in self._select_limits:
        self._select_limits[entity.__name__] = self._get_average_row_size(entity)
    limit = floor(self._size_limit / self._select_limits[entity.__name__])
    offset = 0
    ret = []
    while True:
        try:
            result = ff.retry(
                lambda: self._exec(f'{sql} limit {limit} offset {offset}', params),
                should_retry=lambda err: 'Database returned more than the allowed response size limit' not in str(err)
            )
        except ClientError as e:
            # The Data API caps response size; halve the page size and try again
            # rather than retrying the same oversized request.
            if 'Database returned more than the allowed response size limit' in str(e) and limit > 10:
                limit = floor(limit / 2)
                self._select_limits[entity.__name__] = limit
                continue
            raise e
        for row in result['records']:
            obj = self._serializer.deserialize(row[0]['stringValue'])
            ret.append(entity.from_dict(obj))
        if len(result['records']) < limit:
            break
        offset += limit

    return ret
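# The calls throughout this module rely on ff.retry accepting a zero-argument callable,
# an optional should_retry predicate, and (elsewhere) a wait between attempts. The
# sketch below illustrates that assumed contract only; it is not the firefly
# implementation, and _retry_sketch / max_attempts are hypothetical names.
import time
from typing import Any, Callable


def _retry_sketch(cb: Callable[[], Any],
                  should_retry: Callable[[Exception], bool] = lambda e: True,
                  wait: float = 1,
                  max_attempts: int = 3) -> Any:
    for attempt in range(1, max_attempts + 1):
        try:
            return cb()
        except Exception as e:
            # Give up immediately on errors the caller marked as non-retryable,
            # or once the attempt budget is exhausted.
            if not should_retry(e) or attempt == max_attempts:
                raise
            time.sleep(wait)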
def _get_average_row_size(self, entity: Type[ff.Entity]):
    result = ff.retry(lambda: self._exec(
        f"select CEIL(AVG(LENGTH(obj))) from {self._fqtn(entity)}", []))
    try:
        return result['records'][0][0]['longValue'] / 1024
    except KeyError:
        return 1
def _load_query_results(self, sql: str, params: list, limit: int, offset: int):
    return ff.retry(
        lambda: self._exec(f'{sql} limit {limit} offset {offset}', params),
        should_retry=lambda err: 'Database returned more than the allowed response size limit' not in str(err)
    )['records']
def _fetch_multiple_large_documents(self, sql: str, params: list, entity: Type[ff.Entity]):
    ret = []
    q = self._identifier_quote_char
    sql = sql.replace(f'select {q}document{q}', 'select id')
    result = ff.retry(lambda: self._execute(sql, params))
    for row in result:
        ret.append(self._fetch_large_document(row['id'], entity))

    return ret
def _fetch_multiple_large_documents(self, sql: str, params: list, entity: Type[ff.Entity]):
    ret = []
    sql = sql.replace('select obj', 'select id')
    result = ff.retry(lambda: self._exec(sql, params))
    for row in result['records']:
        ret.append(self._fetch_large_document(row[0]['stringValue'], entity))

    return ret
def _find(self, uuid: str, entity_type: Type[ff.Entity]):
    sql = f"select {self._generate_select_list(entity_type)} from {self._fqtn(entity_type)} where id = :id"
    params = [{'name': 'id', 'value': {'stringValue': uuid}}]
    result = ff.retry(
        lambda: self._exec(sql, params),
        should_retry=lambda err: 'Database returned more than the allowed response size limit' not in str(err)
    )
    if len(result['records']) == 0:
        return None

    return self._build_entity(entity_type, result['records'][0])
def _insert_large_document(self, entity: ff.Entity, update: bool = False):
    obj = self._serializer.serialize(entity)
    n = self._size_limit * 1024
    first = True
    for chunk in [obj[i:i + n] for i in range(0, len(obj), n)]:
        if first:
            if update:
                ff.retry(lambda: self._exec(*self._generate_update(entity, part=chunk)))
            else:
                ff.retry(lambda: self._exec(*self._generate_insert(entity, part=chunk)))
            first = False
        else:
            sql = f"update {self._fqtn(entity.__class__)} set obj = CONCAT(obj, :str) where id = :id"
            params = [
                {'name': 'id', 'value': {'stringValue': entity.id_value()}},
                {'name': 'str', 'value': {'stringValue': chunk}},
            ]
            ff.retry(lambda: self._exec(sql, params))
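# _insert_large_document writes the first chunk with a normal insert/update and appends
# the remaining chunks with SQL CONCAT, so the full serialized entity never has to fit
# in one statement. The slicing itself is plain string chunking; _chunk_sketch below is
# a hypothetical helper used only to illustrate that the chunks reassemble losslessly.
def _chunk_sketch(obj: str, n: int) -> list:
    return [obj[i:i + n] for i in range(0, len(obj), n)]


assert ''.join(_chunk_sketch('{"id": "abc", "name": "x"}', 8)) == '{"id": "abc", "name": "x"}'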
def _get_average_row_size(self, entity: Type[ff.Entity]):
    schema, table = self._fqtn(entity).split('.')
    sql = f"""
        select avg_row_length
        from information_schema.tables
        where table_schema = '{schema}' and table_name = '{table}'
    """
    result = ff.retry(lambda: self._execute(sql))
    try:
        return result[0]['AVG_ROW_LENGTH'] / 1024
    except KeyError:
        return 1
def _get_table_indexes(self, entity: Type[ffd.Entity]):
    schema, table = self._fqtn(entity).split('.')
    sql = f"""
        select COLUMN_NAME
        from information_schema.STATISTICS
        where TABLE_NAME = '{table}' and TABLE_SCHEMA = '{schema}' and INDEX_NAME != 'PRIMARY'
    """
    result = ff.retry(lambda: self._exec(sql, []))
    ret = []
    for row in result['records']:
        ret.append(row[0]['stringValue'])

    return ret
def __call__(self, sql: str, table: domain.Table = None, output_file: str = None,
             cache_seconds: int = None) -> Optional[pd.DataFrame]:
    # This is temporary code to get warehouse queries working. It uses Athena. We either need to move this
    # code, specifically the aws wrangler part, to an infrastructure class or finish the original approach
    # using lambda. Also, the database name is assumed here, and it shouldn't be.
    self._sql_parser.parse(sql)
    if table is None:
        table: domain.Table = self._catalog_registry.get_table(self._sql_parser.get_table())

    params = {
        'sql': sql,
        'database': f'data_warehouse_{self._ff_environment}',
        'ctas_approach': False,
        'use_threads': True,
    }
    if cache_seconds is not None:
        params['max_cache_seconds'] = cache_seconds
    # Pass params through so max_cache_seconds actually takes effect when requested.
    results = ff.retry(lambda: wr.athena.read_sql_query(**params))

    try:
        self._remove_duplicates(results, table)
        self._sort(results)
    except KeyError:
        pass

    if output_file is not None:
        if not output_file.startswith('s3://'):
            output_file = f's3://{output_file}'
        for column in table.columns:
            if column.data_type in (date, datetime) and column.name in results:
                results[column.name] = results[column.name].apply(
                    lambda x: x if x is None else x.isoformat())
        wr.s3.to_json(df=results, path=output_file, use_threads=True)
    else:
        return results
def _invoke_lambda(self, message: Union[Command, Query]):
    if hasattr(message, '_async') and getattr(message, '_async') is True:
        return self._enqueue_message(message)

    try:
        response = ff.retry(
            lambda: self._lambda_client.invoke(
                FunctionName=f'{self._service_name(message.get_context())}Sync',
                InvocationType='RequestResponse',
                LogType='None',
                Payload=self._serializer.serialize(message)
            ),
            wait=2
        )
    except ClientError as e:
        raise ff.MessageBusError(str(e))

    ret = self._serializer.deserialize(response['Payload'].read().decode('utf-8'))
    if isinstance(ret, dict) and 'PAYLOAD_KEY' in ret:
        ret = self._load_payload(ret['PAYLOAD_KEY'])

    return ret
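# Notes on the branch above, inferred from the code rather than documented behavior:
# messages flagged _async are enqueued instead of invoked synchronously, and when the
# deserialized response is a dict containing 'PAYLOAD_KEY' the real payload appears to
# be stored out of band and is fetched through self._load_payload. response['Payload']
# is the boto3 StreamingBody returned by lambda invoke, hence .read().decode('utf-8').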
def _drop_table_index(self, entity: Type[ffd.Entity], name: str):
    ff.retry(lambda: self._exec(
        f"drop index `idx_{name}` on {self._fqtn(entity)}", []))
    ff.retry(lambda: self._exec(
        f"alter table {self._fqtn(entity)} drop column `{name}`", []))
def _add(self, entity: ff.Entity):
    sql, params = self._generate_insert(entity)
    ff.retry(lambda: self._exec(sql, params))
def _update(self, entity: ff.Entity):
    sql, params = self._generate_update(entity)
    ff.retry(lambda: self._exec(sql, params))
def _get_result_count(self, sql: str, params: list):
    count_sql = f"select count(1) as c from ({sql}) a"
    result = ff.retry(lambda: self._execute(count_sql, params))
    return result[0]['c']
def _get_result_count(self, sql: str, params: list):
    count_sql = f"select count(*) from ({sql}) a"
    result = ff.retry(lambda: self._exec(count_sql, params))
    return result['records'][0][0]['longValue']
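# The record parsing in the Data API variants above assumes the shape of the boto3
# rds-data execute_statement response: result['records'] is a list of rows, each row a
# list of cells, and each cell a dict keyed by value type ('stringValue', 'longValue',
# 'booleanValue', ...). A hypothetical count response, for example:
#
#   {'records': [[{'longValue': 42}]]}
#
# which is why _get_result_count reads result['records'][0][0]['longValue'].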