Exemplo n.º 1
0
 def get_segmentation_predict_data(self, params):  # pylint: disable=R0201
     """
         fetch data for Segmentation
         params = {"subquery": target_query,
                   "feature_columns": feature_columns}
     """
     joined_features = ', '.join(
         ['"{}"::numeric'.format(a) for a in params['feature_columns']])
     query = '''
             SELECT
               Array[{joined_features}] As features
             FROM ({subquery}) as q
             '''.format(subquery=params['subquery'],
                        joined_features=joined_features)
     return plpy.cursor(query)
Exemplo n.º 2
0
def predict_segment(model, features, target_query):
    """
    Use the provided model to predict the values for the new feature set
        Input:
            @param model: The pretrained model
            @features: A list of features to use in the model prediction (list of column names)
            @target_query: The query to run to obtain the data to predict on and the cartdb_ids associated with it.
    """

    batch_size = 1000
    joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])

    try:
        cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a'.format(
            joined_features=joined_features,
            target_query=target_query))
    except Exception, e:
        plpy.error('Failed to build segmentation model: %s' % e)
Exemplo n.º 3
0
def setup_system_catalog(presto_server, presto_user, presto_catalog, presto_schema, access_role):
    search_path = _get_session_search_path_array()
    if search_path == ['$user', 'public']:
        # search_path is default value.
        plpy.execute("set search_path to %s" % plpy.quote_ident(presto_schema))

    client = presto_client.Client(server=presto_server, user=presto_user, catalog=presto_catalog, schema='default')

    # get table list
    sql = "select table_schema, table_name, column_name, is_nullable, data_type" \
          " from information_schema.columns"
    columns, rows = client.run(sql)
    if rows is None:
        rows = []

    schemas = {}

    for row in rows:
        schema_name = row[0]
        table_name = row[1]
        column_name = row[2]
        is_nullable = row[3]
        column_type = row[4]

        if schema_name == "sys" or schema_name == "information_schema":
            # skip system schemas
            continue

        if len(schema_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Schema %s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), PG_NAMEDATALEN - 1))
            continue

        tables = schemas.setdefault(schema_name, {})

        if len(table_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Table %s.%s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), PG_NAMEDATALEN - 1))
            continue

        columns = tables.setdefault(table_name, [])

        if len(column_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Column %s.%s.%s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), \
                     plpy.quote_ident(column_name), PG_NAMEDATALEN - 1))
            continue

        columns.append(Column(column_name, column_type, is_nullable))

    # drop all schemas excepting prestogres_catalog, information_schema and pg_%
    sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
          " where n.nspname not in ('prestogres_catalog', 'information_schema')" \
          " and n.nspname not like 'pg_%'"
    for row in plpy.cursor(sql):
        plpy.execute("drop schema %s cascade" % plpy.quote_ident(row["schema_name"]))

    # create schema and tables
    for schema_name, tables in sorted(schemas.items(), key=lambda (k,v): k):
        try:
            plpy.execute("create schema %s" % (plpy.quote_ident(schema_name)))
        except:
            # ignore error?
            pass

        for table_name, columns in sorted(tables.items(), key=lambda (k,v): k):
            column_names = []
            column_types = []
            not_nulls = []

            if len(columns) >= 1600:
                plpy.warning("Table %s.%s contains more than 1600 columns. Some columns will be inaccessible" % (plpy.quote_ident(schema_name), plpy.quote_ident(table_name)))

            for column in columns[0:1600]:
                column_names.append(column.name)
                column_types.append(_pg_table_type(column.type))
                not_nulls.append(not column.nullable)

            # change columns
            column_names = _rename_duplicated_column_names(column_names,
                    "%s.%s table" % (plpy.quote_ident(schema_name), plpy.quote_ident(table_name)))
            create_sql = _build_create_table(schema_name, table_name, column_names, column_types, not_nulls)
            plpy.execute(create_sql)

        # grant access on the schema to the restricted user so that
        # pg_table_is_visible(reloid) used by \d of psql command returns true
        plpy.execute("grant usage on schema %s to %s" % \
                (plpy.quote_ident(schema_name), plpy.quote_ident(access_role)))
        # this SELECT privilege is unnecessary because queries against those tables
        # won't run on PostgreSQL. causing an exception is good if Prestogres has
        # a bug sending a presto query to PostgreSQL without rewriting.
        # TODO however, it's granted for now because some BI tools might check
        #      has_table_privilege. the best solution is to grant privilege but
        #      actually selecting from those tables causes an exception.
        plpy.execute("grant select on all tables in schema %s to %s" % \
                (plpy.quote_ident(schema_name), plpy.quote_ident(access_role)))

    # fake current_database() to return Presto's catalog name to be compatible with some
    # applications that use db.schema.table syntax to identify a table
    if plpy.execute("select pg_catalog.current_database()")[0].values()[0] != presto_catalog:
        plpy.execute("delete from pg_catalog.pg_proc where proname='current_database'")
        plpy.execute("create function pg_catalog.current_database() returns name as $$begin return %s::name; end$$ language plpgsql stable strict" % \
                plpy.quote_literal(presto_catalog))
Exemplo n.º 4
0
def setup_system_catalog(presto_server, presto_user, presto_catalog, access_role):
    client = presto_client.Client(server=presto_server, user=presto_user, catalog=presto_catalog, schema='default')

    # get table list
    sql = "select table_schema, table_name, column_name, is_nullable, data_type" \
          " from information_schema.columns"
    columns, rows = client.run(sql)
    if rows is None:
        rows = []

    schemas = {}

    for row in rows:
        schema_name = row[0]
        table_name = row[1]
        column_name = row[2]
        is_nullable = row[3]
        column_type = row[4]

        if schema_name == "sys" or schema_name == "information_schema":
            # skip system schemas
            continue

        if len(schema_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Schema %s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), PG_NAMEDATALEN - 1))
            continue

        tables = schemas.setdefault(schema_name, {})

        if len(table_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Table %s.%s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), PG_NAMEDATALEN - 1))
            continue

        columns = tables.setdefault(table_name, [])

        if len(column_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Column %s.%s.%s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), \
                     plpy.quote_ident(column_name), PG_NAMEDATALEN - 1))
            continue

        columns.append(Column(column_name, column_type, is_nullable))

    # drop all schemas excepting prestogres_catalog, information_schema and pg_%
    sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
          " where n.nspname not in ('prestogres_catalog', 'information_schema')" \
          " and n.nspname not like 'pg_%'"
    for row in plpy.cursor(sql):
        plpy.execute("drop schema %s cascade" % plpy.quote_ident(row["schema_name"]))

    # create schema and tables
    for schema_name, tables in sorted(schemas.items(), key=lambda (k,v): k):
        try:
            plpy.execute("create schema %s" % (plpy.quote_ident(schema_name)))
        except:
            # ignore error?
            pass

        # grant access on the all tables to the restricted user
        plpy.execute("grant select on all tables in schema %s to %s" % \
                (plpy.quote_ident(schema_name), plpy.quote_ident(access_role)))

        for table_name, columns in sorted(tables.items(), key=lambda (k,v): k):
            column_names = []
            column_types = []
            not_nulls = []
            for column in columns:
                column_names.append(column.name)
                column_types.append(_pg_table_type(column.type))
                not_nulls.append(not column.nullable)

            # change columns
            create_sql = _build_create_table(schema_name, table_name, column_names, column_types, not_nulls)
            plpy.execute(create_sql)

    # update pg_database
    plpy.execute("update pg_database set datname=%s where datname=current_database()" % \
            plpy.quote_literal(presto_catalog))
Exemplo n.º 5
0
def run_system_catalog_as_temp_table(server, user, catalog, schema, login_user, login_database, result_table, query):
    try:
        client = presto_client.Client(server=server, user=user, catalog=catalog, schema=schema, time_zone=_get_session_time_zone())

        # create SQL statements which put data to system catalogs
        if SchemaCacheEntry.is_cached(server, user, catalog, schema, time.time()):
            schema_names = SchemaCacheEntry.schema_names
            statements = SchemaCacheEntry.statements
            query_cache = SchemaCacheEntry.query_cache

        else:
            # get table list
            sql = "select table_schema, table_name, column_name, is_nullable, data_type" \
                  " from information_schema.columns"
            columns, rows = client.run(sql)

            schemas = {}

            if rows is None:
                rows = []

            for row in rows:
                schema_name = row[0]
                table_name = row[1]
                column_name = row[2]
                is_nullable = row[3]
                column_type = row[4]

                if len(schema_name) > PG_NAMEDATALEN - 1:
                    plpy.warning("Schema %s is skipped because its name is longer than %d characters" % \
                            (plpy.quote_ident(schema_name), PG_NAMEDATALEN - 1))
                    continue

                tables = schemas.setdefault(schema_name, {})

                if len(table_name) > PG_NAMEDATALEN - 1:
                    plpy.warning("Table %s.%s is skipped because its name is longer than %d characters" % \
                            (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), PG_NAMEDATALEN - 1))
                    continue

                columns = tables.setdefault(table_name, [])

                if len(column_name) > PG_NAMEDATALEN - 1:
                    plpy.warning("Column %s.%s.%s is skipped because its name is longer than %d characters" % \
                            (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), \
                             plpy.quote_ident(column_name), PG_NAMEDATALEN - 1))
                    continue

                columns.append(Column(column_name, column_type, is_nullable))

            # generate SQL statements
            statements = []
            schema_names = []

            # create a restricted user using the same name with the login user name to pgpool2
            statements.append(
                    "do $$ begin if not exists (select * from pg_catalog.pg_roles where rolname=%s) then create role %s with login; end if; end $$" % \
                    (plpy.quote_literal(login_user), plpy.quote_ident(login_user)))

            # grant access on the all table holders to the restricted user
            statements.append("grant select on all tables in schema prestogres_catalog to %s" % \
                    plpy.quote_ident(login_user))

            table_holder_id = 0

            for schema_name, tables in sorted(schemas.items(), key=lambda (k,v): k):
                if schema_name == "sys" or schema_name == "information_schema":
                    # skip system schemas
                    continue

                schema_names.append(schema_name)

                for table_name, columns in sorted(tables.items(), key=lambda (k,v): k):
                    # table schema
                    column_names = []
                    column_types = []
                    not_nulls = []
                    for column in columns:
                        column_names.append(column.name)
                        column_types.append(_pg_table_type(column.type))
                        not_nulls.append(not column.nullable)

                    # rename table holder into the schema
                    statements.append("alter table prestogres_catalog.table_holder_%d set schema %s" % \
                            (table_holder_id, plpy.quote_ident(schema_name)))
                    statements.append("alter table %s.table_holder_%d rename to %s" % \
                            (plpy.quote_ident(schema_name), table_holder_id, plpy.quote_ident(table_name)))

                    # change columns
                    alter_sql = _build_alter_table_holder_sql(schema_name, table_name, column_names, column_types, not_nulls)
                    statements.append(alter_sql)

                    table_holder_id += 1

            # cache expires after 60 seconds
            SchemaCacheEntry.set_cache(server, user, catalog, schema, schema_names, statements, time.time() + 60)
            query_cache = {}

        query_result = query_cache.get(query)

        if query_result:
            column_names = query_result.column_names
            column_types = query_result.column_types
            result = query_result.result

        else:
            # enter subtransaction to rollback tables right after running the query
            subxact = plpy.subtransaction()
            subxact.enter()
            try:
                # drop all schemas excepting prestogres_catalog, pg_catalog, information_schema, public
                # and schema holders
                sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
                      " where n.nspname not in ('prestogres_catalog', 'pg_catalog', 'information_schema', 'public')" \
                      " and n.nspname not like 'prestogres_catalog_schema_holder_%'" \
                      " and n.nspname !~ '^pg_toast'"
                for row in plpy.cursor(sql):
                    plpy.execute("drop schema %s cascade" % plpy.quote_ident(row["schema_name"]))

                # alter schema holders
                schema_holder_id = 0
                for schema_name in schema_names:
                    try:
                        plpy.execute("alter schema prestogres_catalog_schema_holder_%s rename to %s" % \
                                (schema_holder_id, plpy.quote_ident(schema_name)))
                        schema_holder_id += 1
                    except:
                        # ignore error?
                        pass

                # alter table holders in prestogres_catalog schema
                for statement in statements:
                    plpy.execute(statement)

                # drop prestogres_catalog schema
                plpy.execute("drop schema prestogres_catalog cascade")

                # drop schema holders
                sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
                      " where n.nspname like 'prestogres_catalog_schema_holder_%'"
                for row in plpy.cursor(sql):
                    plpy.execute("drop schema %s" % plpy.quote_ident(row["schema_name"]))

                # update pg_database
                plpy.execute("update pg_database set datname=%s where datname=current_database()" % \
                        plpy.quote_literal(schema_name))

                # switch to the restricted role
                plpy.execute("set role to %s" % plpy.quote_ident(login_user))

                # run the actual query and save result
                metadata = plpy.execute(query)
                column_names = metadata.colnames()
                column_type_oids = metadata.coltypes()
                result = map(lambda row: map(row.get, column_names), metadata)

                # save result schema
                oid_to_type_name = _load_oid_to_type_name_mapping(column_type_oids)
                column_types = map(oid_to_type_name.get, column_type_oids)

                # store query cache
                query_cache[query] = QueryResult(column_names, column_types, result)

            finally:
                # rollback subtransaction
                subxact.exit("rollback subtransaction", None, None)

        column_names = _rename_duplicated_column_names(column_names)
        create_sql = _build_create_temp_table_sql(result_table, column_names, column_types)
        insert_sql = _build_insert_into_sql(result_table, column_names)

        # run CREATE TABLE and INSERT
        plpy.execute("drop table if exists " + plpy.quote_ident(result_table))
        plpy.execute(create_sql)
        _batch_insert(insert_sql, 10, column_types, result)

    except (plpy.SPIError, presto_client.PrestoException) as e:
        # Set __module__ = "__module__" to generate pretty messages.
        e.__class__.__module__ = "__main__"
        raise