Example #1
 def _add_table_index(self, entity: Type[ffd.Entity], field_):
     ff.retry(lambda: self._exec(
         f"alter table {self._fqtn(entity)} add column `{field_.name}` {self._db_type(field_)}",
         []))
     ff.retry(lambda: self._exec(
         f"create index `idx_{field_.name}` on {self._fqtn(entity)} (`{field_.name}`)",
         []))
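Every example in this listing routes its call through ff.retry. The helper itself is not shown here; a minimal sketch consistent with the call sites (a zero-argument callable, an optional should_retry predicate, an optional wait in seconds) might look like the following, where the default values and the attempt cap are assumptions:

import time
from typing import Any, Callable


def retry(cb: Callable[[], Any],
          should_retry: Callable[[Exception], bool] = lambda err: True,
          wait: int = 1,
          max_attempts: int = 5) -> Any:
    # Hypothetical helper mirroring the ff.retry call sites in this listing:
    # run the callable; on failure, stop if the predicate says the error is
    # not retryable or the (assumed) attempt cap is hit, otherwise sleep
    # briefly and try again.
    attempt = 0
    while True:
        try:
            return cb()
        except Exception as err:
            attempt += 1
            if not should_retry(err) or attempt >= max_attempts:
                raise
            time.sleep(wait)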
Example #2
 def _drop_table_index(self, entity: Type[ffd.Entity], name: str):
     index = f'`idx_{name}`'
     ff.retry(lambda: self._exec(
         f"drop index {index} on {self._fqtn(entity)}", []))
     column = f'`{name}`'
     ff.retry(lambda: self._exec(
         f"alter table {self._fqtn(entity)} drop column {column}", []))
Example #3
 def _remove(self, entity: ff.Entity):
     sql = f"delete from {self._fqtn(entity.__class__)} where id = :id"
     params = [
         {
             'name': 'id',
             'value': {
                 'stringValue': entity.id_value()
             }
         },
     ]
     ff.retry(lambda: self._exec(sql, params))
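The parameter lists in these snippets ('name' / 'value' / 'stringValue') follow the AWS RDS Data API format, so self._exec is presumably a thin wrapper around execute_statement. A rough sketch under that assumption, with the cluster ARN, secret ARN and database name as hypothetical configuration values:

import boto3


def _exec(sql: str, params: list, *, cluster_arn: str, secret_arn: str,
          database: str) -> dict:
    # Sketch only: the examples treat self._exec as returning the raw
    # execute_statement response, whose 'records' field holds the rows.
    client = boto3.client('rds-data')
    return client.execute_statement(
        resourceArn=cluster_arn,
        secretArn=secret_arn,
        database=database,
        sql=sql,
        parameters=params,
    )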
Example #4
    def _paginate(self, sql: str, params: list, entity: Type[ff.Entity]):
        if entity.__name__ not in self._select_limits:
            self._select_limits[entity.__name__] = self._get_average_row_size(
                entity)
        limit = floor(self._size_limit / self._select_limits[entity.__name__])
        offset = 0

        ret = []
        while True:
            try:
                result = ff.retry(
                    lambda: self._exec(f'{sql} limit {limit} offset {offset}',
                                       params),
                    should_retry=lambda err:
                    'Database returned more than the allowed response size limit'
                    not in str(err))
            except ClientError as e:
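                # The Data API caps the response size; when the cap is hit,
                # halve the page size (while it is still above 10 rows) and
                # retry this page with the smaller limit.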
                if 'Database returned more than the allowed response size limit' in str(
                        e) and limit > 10:
                    limit = floor(limit / 2)
                    self._select_limits[entity.__name__] = limit
                    continue
                raise e

            for row in result['records']:
                obj = self._serializer.deserialize(row[0]['stringValue'])
                ret.append(entity.from_dict(obj))
            if len(result['records']) < limit:
                break
            offset += limit

        return ret
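To make the paging arithmetic concrete, a worked example with made-up numbers (the real _size_limit and per-entity average row size come from the instance):

from math import floor

# Illustrative figures only: a 1024 KB response budget and an average
# serialized row of 4 KB give floor(1024 / 4) = 256 rows per page; after a
# size-limit error the page shrinks to 128, then 64, and so on.
size_limit_kb = 1024
avg_row_kb = 4
limit = floor(size_limit_kb / avg_row_kb)
assert limit == 256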
Example #5
 def _get_average_row_size(self, entity: Type[ff.Entity]):
     result = ff.retry(lambda: self._exec(
         f"select CEIL(AVG(LENGTH(obj))) from {self._fqtn(entity)}", []))
     try:
         return result['records'][0][0]['longValue'] / 1024
     except KeyError:
         return 1
Example #6
 def _load_query_results(self, sql: str, params: list, limit: int,
                         offset: int):
     return ff.retry(
         lambda: self._exec(f'{sql} limit {limit} offset {offset}', params),
         should_retry=lambda err:
         'Database returned more than the allowed response size limit'
         not in str(err))['records']
Example #7
 def _fetch_multiple_large_documents(self, sql: str, params: list, entity: Type[ff.Entity]):
     ret = []
     q = self._identifier_quote_char
     sql = sql.replace(f'select {q}document{q}', 'select id')
     result = ff.retry(lambda: self._execute(sql, params))
     for row in result:
         ret.append(self._fetch_large_document(row['id'], entity))
     return ret
Example #8
 def _fetch_multiple_large_documents(self, sql: str, params: list,
                                     entity: Type[ff.Entity]):
     ret = []
     sql = sql.replace('select obj', 'select id')
     result = ff.retry(lambda: self._exec(sql, params))
     for row in result['records']:
         ret.append(
             self._fetch_large_document(row[0]['stringValue'], entity))
     return ret
Example #9
    def _find(self, uuid: str, entity_type: Type[ff.Entity]):
        sql = f"select {self._generate_select_list(entity_type)} from {self._fqtn(entity_type)} where id = :id"
        params = [{'name': 'id', 'value': {'stringValue': uuid}}]
        result = ff.retry(
            lambda: self._exec(sql, params),
            should_retry=lambda err:
            'Database returned more than the allowed response size limit'
            not in str(err))
        if len(result['records']) == 0:
            return None

        return self._build_entity(entity_type, result['records'][0])
Example #10
 def _insert_large_document(self, entity: ff.Entity, update: bool = False):
     obj = self._serializer.serialize(entity)
     n = self._size_limit * 1024
     first = True
     for chunk in [obj[i:i + n] for i in range(0, len(obj), n)]:
         if first:
             if update:
                 ff.retry(lambda: self._exec(*self._generate_update(
                     entity, part=chunk)))
             else:
                 ff.retry(lambda: self._exec(*self._generate_insert(
                     entity, part=chunk)))
             first = False
         else:
             sql = f"update {self._fqtn(entity.__class__)} set obj = CONCAT(obj, :str) where id = :id"
             params = [
                 {
                     'name': 'id',
                     'value': {
                         'stringValue': entity.id_value()
                     }
                 },
                 {
                     'name': 'str',
                     'value': {
                         'stringValue': chunk
                     }
                 },
             ]
             ff.retry(lambda: self._exec(sql, params))
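The slicing above splits the serialized document into fixed-size pieces. An illustration with assumed sizes (the real chunk size is self._size_limit * 1024 bytes):

# A 100 KB serialized document and a 32 KB per-statement limit yield four
# chunks; the first goes into the insert or update, the rest are appended
# with CONCAT(obj, :str).
obj = 'x' * (100 * 1024)
n = 32 * 1024
chunks = [obj[i:i + n] for i in range(0, len(obj), n)]
assert len(chunks) == 4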
Example #11
 def _get_average_row_size(self, entity: Type[ff.Entity]):
     schema, table = self._fqtn(entity).split('.')
     sql = f"""
         select avg_row_length
         from information_schema.tables
         where table_schema = '{schema}'
         and table_name = '{table}'
     """
     result = ff.retry(lambda: self._execute(sql))
     try:
         return result[0]['AVG_ROW_LENGTH'] / 1024
     except KeyError:
         return 1
Example #12
    def _get_table_indexes(self, entity: Type[ffd.Entity]):
        schema, table = self._fqtn(entity).split('.')
        sql = f"""
            select COLUMN_NAME
            from information_schema.STATISTICS
            where TABLE_NAME = '{table}'
            and TABLE_SCHEMA = '{schema}'
            and INDEX_NAME != 'PRIMARY'
        """
        result = ff.retry(lambda: self._exec(sql, []))

        ret = []
        for row in result['records']:
            ret.append(row[0]['stringValue'])

        return ret
Example #13
    def __call__(self,
                 sql: str,
                 table: domain.Table = None,
                 output_file: str = None,
                 cache_seconds: int = None) -> Optional[pd.DataFrame]:
        # This is temporary code to get warehouse queries working. This uses athena. We either need to move this
        # code, specifically the aws wrangler part, to an infrastructure class or finish the original approach using
        # lambda. Also, the database name is assumed here, and it shouldn't be.
        self._sql_parser.parse(sql)
        if table is None:
            table: domain.Table = self._catalog_registry.get_table(
                self._sql_parser.get_table())

        params = {
            'sql': sql,
            'database': f'data_warehouse_{self._ff_environment}',
            'ctas_approach': False,
            'use_threads': True,
        }

        if cache_seconds is not None:
            params['max_cache_seconds'] = cache_seconds

        # Pass the assembled kwargs so a requested max_cache_seconds actually
        # takes effect rather than being silently dropped.
        results = ff.retry(lambda: wr.athena.read_sql_query(**params))

        try:
            self._remove_duplicates(results, table)
            self._sort(results)
        except KeyError:
            pass

        if output_file is not None:
            if not output_file.startswith('s3://'):
                output_file = f's3://{output_file}'
            for column in table.columns:
                if column.data_type in (date,
                                        datetime) and column.name in results:
                    results[column.name] = results[column.name].apply(
                        lambda x: x if x is None else x.isoformat())
            wr.s3.to_json(df=results, path=output_file, use_threads=True)
        else:
            return results
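A hypothetical call site for the callable above; the instance name, the SQL and the bucket path are made up for illustration:

# Returns a DataFrame when no output_file is given.
df = run_warehouse_query(
    "select order_id, total from orders",
    cache_seconds=300,
)

# Or write the result to S3 as JSON instead of returning it.
run_warehouse_query(
    "select order_id, total from orders",
    output_file='my-bucket/exports/orders.json',
)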
Example #14
    def _invoke_lambda(self, message: Union[Command, Query]):
        if hasattr(message, '_async') and getattr(message, '_async') is True:
            return self._enqueue_message(message)

        try:
            response = ff.retry(
                lambda: self._lambda_client.invoke(
                    FunctionName=f'{self._service_name(message.get_context())}Sync',
                    InvocationType='RequestResponse',
                    LogType='None',
                    Payload=self._serializer.serialize(message)
                ),
                wait=2
            )
        except ClientError as e:
            raise ff.MessageBusError(str(e))

        ret = self._serializer.deserialize(response['Payload'].read().decode('utf-8'))
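        # If the response only carries a payload reference, resolve it via
        # _load_payload (presumably the actual payload was too large to be
        # returned inline and was offloaded elsewhere).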
        if isinstance(ret, dict) and 'PAYLOAD_KEY' in ret:
            ret = self._load_payload(ret['PAYLOAD_KEY'])

        return ret
Example #15
 def _drop_table_index(self, entity: Type[ffd.Entity], name: str):
     ff.retry(lambda: self._exec(
         f"drop index `idx_{name}` on {self._fqtn(entity)}", []))
     ff.retry(lambda: self._exec(
         f"alter table {self._fqtn(entity)} drop column `{name}`", []))
Example #16
 def _add(self, entity: ff.Entity):
     sql, params = self._generate_insert(entity)
     ff.retry(lambda: self._exec(sql, params))
Example #17
 def _update(self, entity: ff.Entity):
     sql, params = self._generate_update(entity)
     ff.retry(lambda: self._exec(sql, params))
Example #18
 def _get_result_count(self, sql: str, params: list):
     count_sql = f"select count(1) as c from ({sql}) a"
     result = ff.retry(lambda: self._execute(count_sql, params))
     return result[0]['c']
Example #19
 def _get_result_count(self, sql: str, params: list):
     count_sql = f"select count(*) from ({sql}) a"
     result = ff.retry(lambda: self._exec(count_sql, params))
     return result['records'][0][0]['longValue']