def _send_to_plpy(self, level, text, exception=None):
    """Log *text* through plpy at *level*, appending exception info if given.

    ``exception`` may be an exception instance or a ``sys.exc_info()``
    tuple, in which case the exception object (second element) is used.
    """
    if exception:
        if isinstance(exception, tuple) and len(exception) > 1:
            exception = exception[1]
        suffix = '. Exception: {}'.format(exception)
    else:
        suffix = ''
    # Adding trace breaks tests
    # trace = traceback.format_exc(15)
    # message = '{}{}. Trace: {}'.format(text, suffix, trace)
    message = '{}{}'.format(text, suffix)
    if not self._check_plpy():
        return
    if level == 'debug':
        plpy.debug(message)
    elif level == 'info':
        plpy.info(message)
    elif level == 'warning':
        plpy.warning(message)
    elif level == 'error':
        # plpy.error and plpy.fatal raise exceptions; here we only want to
        # log, so errors are downgraded to warnings and callers raise
        # explicitly when needed.
        plpy.warning(message)
def isconnected(transfos, doubletransfo=False):
    """ Check if transfos list corresponds to a connected graph """
    # Resolve the (source, target) referentials of each transformation id.
    value_rows = ['{},{}'.format(idx, tid) for idx, tid in enumerate(transfos)]
    values_sql = '({})'.format('),('.join(value_rows))
    rows = plpy.execute(
        """
        select id, source, target
        from (values {}) as v
        join li3ds.transfo t on v.column2 = t.id
        order by v.column1
        """.format(values_sql)
    )
    # Unless explicitly allowed, reject multiple edges between the same
    # source/target pair.
    if not doubletransfo:
        distinct_pairs = {(row['source'], row['target']) for row in rows}
        if len(distinct_pairs) != len(rows):
            return False
    edges = {row['id']: (row['source'], row['target']) for row in rows}
    # Build an undirected adjacency map over the referentials (nodes).
    adjacency = defaultdict(set)
    nodes = set(chain.from_iterable(edges.values()))
    for source, target in edges.values():
        adjacency[source].add(target)
        adjacency[target].add(source)
    # Breadth-first traversal from an arbitrary node.
    start = list(nodes)[0]
    seen = {start}
    pending = deque([start])
    while pending:
        current = pending.popleft()
        for neighbour in adjacency[current]:
            if neighbour not in seen:
                seen.add(neighbour)
                pending.append(neighbour)
    # Connected iff the traversal reached every node.
    if len(seen) != len(nodes):
        plpy.warning(
            'disconnected graph, visited nodes {}, total {}'
            .format(len(seen), len(nodes))
        )
        return False
    return True
def _send_to_rollbar(self, level, text, exception, data):
    """Forward a message or the current exception to rollbar, if active.

    Failures to reach rollbar are logged via plpy and never propagated.
    """
    if not self._rollbar_activated():
        return
    try:
        if exception:
            # Report the live exception captured by sys.exc_info().
            rollbar.report_exc_info(sys.exc_info(), extra_data=data,
                                    level=level)
        else:
            rollbar.report_message(text, level, extra_data=data)
    except Exception as e:
        plpy.warning('Error sending message/exception to rollbar: {0}'.
                     format(e))
def _send_to_plpy(self, level, text):
    """Dispatch *text* to the plpy log function matching *level*.

    Unknown levels are silently ignored. 'error' is deliberately logged
    as a warning: plpy.error/plpy.fatal raise exceptions, and exceptions
    should be raised explicitly by callers instead.
    """
    if not self._check_plpy():
        return
    handlers = {
        'debug': plpy.debug,
        'info': plpy.info,
        'warning': plpy.warning,
        'error': plpy.warning,  # see docstring: never raise from here
    }
    handler = handlers.get(level)
    if handler is not None:
        handler(text)
def get_dyn_transfo_params_form_2(params, time):
    ''' Return the dynamic transfo parameters. '''
    if isinstance(time, datetime.datetime):
        # stored _time values are strings; parse them before comparing
        times = [dateutil.parser.parse(p['_time']) for p in params]
    else:
        times = [p['_time'] for p in params]
    # index of the leftmost entry whose time is >= the requested time
    index = bisect.bisect_left(times, time)
    out_of_range = index == len(times) or (index == 0 and time < times[0])
    if out_of_range:
        plpy.warning('no parameters for the provided time ({})'.format(time.isoformat()))
        return None
    return params[index]
def _rename_duplicated_column_names(column_names): renamed = [] used_names = set() for original_name in column_names: name = original_name while name in used_names: name += "_" if name != original_name: plpy.warning("Result column %s is renamed to %s because the name appears twice in a query result" % \ (plpy.quote_ident(original_name), plpy.quote_ident(name))) used_names.add(name) renamed.append(name) return renamed
def country_to_iso3(country):
    """ Convert country to its iso3 code """
    try:
        plan = plpy.prepare("SELECT adm0_a3 as iso3 FROM admin0_synonyms WHERE lower(regexp_replace($1, " \
            "'[^a-zA-Z\u00C0-\u00ff]+', '', 'g'))::text = name_; ", ['text'])
        rows = plpy.execute(plan, [country], 1)
        if not rows:
            return None
        return rows[0]['iso3']
    except BaseException as e:
        # NOTE(review): BaseException also swallows KeyboardInterrupt /
        # SystemExit; Exception is probably the intended breadth — confirm.
        plpy.warning("Can't get the iso3 code from {0}: {1}".format(country, e))
        return None
def country_to_iso3(country):
    """ Convert country to its iso3 code """
    query = ("SELECT adm0_a3 as iso3 FROM admin0_synonyms WHERE lower(regexp_replace($1, "
             "'[^a-zA-Z\u00C0-\u00ff]+', '', 'g'))::text = name_; ")
    try:
        plan = plpy.prepare(query, ['text'])
        result = plpy.execute(plan, [country], 1)
        # At most one row was requested; an empty result means no match.
        return result[0]['iso3'] if result else None
    except BaseException as e:
        plpy.warning("Can't get the iso3 code from {0}: {1}".format(
            country, e))
        return None
def coordinates_to_polygon(coordinates):
    """Convert a Mapzen coordinates to a PostGIS polygon"""
    # Build the "lon lat,lon lat,..." WKT coordinate list.
    wkt_coordinates = ','.join("%s %s" % (c[0], c[1]) for c in coordinates)
    try:
        sql = "SELECT ST_MakePolygon(ST_GeomFromText('LINESTRING({0})', 4326)) as geom".format(wkt_coordinates)
        return plpy.execute(sql, 1)[0]['geom']
    except BaseException as e:
        plpy.warning("Can't generate POLYGON from coordinates: {0}".format(e))
        return None
def coordinates_to_polygon(coordinates):
    """Convert a Mapzen coordinates to a PostGIS polygon"""
    # "lon lat" pairs joined into a WKT coordinate list.
    pairs = ["%s %s" % (c[0], c[1]) for c in coordinates]
    wkt_coordinates = ','.join(pairs)
    try:
        # MakeValid + CollectionExtract(…, 3) keeps only polygonal pieces;
        # st_multi normalizes the result to a MULTIPOLYGON.
        sql = "SELECT st_multi(ST_CollectionExtract(ST_MakeValid(ST_MakePolygon(ST_GeomFromText('LINESTRING({0})', 4326))),3)) as geom".format(wkt_coordinates)
        geometry = plpy.execute(sql, 1)[0]['geom']
    except BaseException as e:
        plpy.warning("Can't generate POLYGON from coordinates: {0}".format(e))
        geometry = None
    return geometry
def polyline_to_linestring(polyline):
    """Convert a Mapzen polyline shape to a PostGIS linestring"""
    # Points arrive as (lat, lon); WKT wants "lon lat".
    wkt_coordinates = ','.join("%s %s" % (pt[1], pt[0]) for pt in polyline)
    try:
        sql = "SELECT ST_GeomFromText('LINESTRING({0})', 4326) as geom".format(
            wkt_coordinates)
        return plpy.execute(sql, 1)[0]['geom']
    except BaseException as e:
        plpy.warning("Can't generate LINESTRING from polyline: {0}".format(e))
        return None
def get_dyn_transfo_params_form_1(params_column, params, time):
    ''' Return the dynamic transfo parameters.

    Interpolates each dimension referenced by ``params`` from the
    pointcloud patch column ``params_column`` ("schema.table.column")
    at the numeric ``time``, and writes the interpolated values back
    into the parameter dict (mutated in place). Returns the updated
    dict, or None when no patch covers ``time``.
    '''
    # Form 1 interpolates with a numeric time ({time:f} below); datetime
    # instances (produced from string times) are therefore rejected.
    if isinstance(time, datetime.datetime):
        plpy.error('times as strings unsupported for dynamic transforms of form 1')
    schema, table, column = tuple(map(plpy.quote_ident, params_column.split('.')))
    # Only the first (single) parameter set is used.
    params = params[0]
    # Collect the SELECT expressions for every scalar or list-valued dim.
    select = []
    for param in params.values():
        if isinstance(param, list):
            for dim in param:
                append_dim_select(dim, select)
        else:
            dim = param
            append_dim_select(dim, select)
    select = ', '.join(select)
    # Interpolate inside the single patch whose [pc_patchmin, pc_patchmax)
    # time range contains ``time``.
    q = ('''
        with patch as (
            select pc_interpolate({column}, 'time', {time:f}, true) point
            from {schema}.{table}
            where pc_patchmin({column}, 'time') <= {time:f}
            and pc_patchmax({column}, 'time') > {time:f}
        )
        select %s from patch
        ''' % select).format(schema=schema, table=table, column=column, time=time)
    plpy.debug(q)
    rv = plpy.execute(q)
    if len(rv) == 0:
        # No patch covers the requested time: not an error, just no params.
        plpy.warning('no parameters for the provided time ({:f})'.format(time))
        return None
    if len(rv) != 1:
        # Overlapping patches would be a data problem; plpy.error raises.
        plpy.error('multiple rows returned from time interpolation')
    values = rv[0]
    # Replace each dim name in ``params`` with its interpolated value,
    # preserving the scalar/list structure.
    for key, param in params.items():
        if isinstance(param, list):
            for i, dim in enumerate(param):
                val = values[dim]
                param[i] = val
        else:
            dim = param
            val = values[dim]
            params[key] = val
    return params
def polyline_to_linestring(polyline):
    """Convert a Mapzen polyline shape to a PostGIS linestring.

    Fix: the docstring previously said "multipolygon", but the function
    builds a LINESTRING (see the sibling polyline_to_linestring above) —
    a copy-paste error, now corrected. Returns the geometry, or None when
    the conversion fails.
    """
    coordinates = []
    for point in polyline:
        # Divide by 10 because mapzen uses one more decimal than the
        # google standard (https://mapzen.com/documentation/turn-by-turn/decoding/)
        # Points arrive as (lat, lon); WKT wants "lon lat".
        coordinates.append("%s %s" % (point[1]/10, point[0]/10))
    wkt_coordinates = ','.join(coordinates)
    try:
        sql = "SELECT ST_GeomFromText('LINESTRING({0})', 4326) as geom".format(wkt_coordinates)
        geometry = plpy.execute(sql, 1)[0]['geom']
    except BaseException as e:
        plpy.warning("Can't generate LINESTRING from polyline: {0}".format(e))
        geometry = None
    return geometry
def _rename_duplicated_column_names(column_names, where):
    """Return column_names with duplicates and PostgreSQL system-column
    conflicts resolved by appending underscores.

    A plpy warning is emitted for every renamed column; *where* is a
    human-readable description of the table/query used in the message.
    """
    renamed = []
    used_names = copy(SYSTEM_COLUMN_NAMES)
    for original_name in column_names:
        name = original_name
        while name in used_names:
            name += "_"
        if name != original_name:
            # BUG FIX: the old code tested the *renamed* name against
            # SYSTEM_COLUMN_NAMES, which can never match — the while loop
            # only exits once ``name`` is outside used_names, a superset of
            # SYSTEM_COLUMN_NAMES — so the system-column message was
            # unreachable and conflicts were misreported as duplicates.
            # Test the original name instead.
            if original_name in SYSTEM_COLUMN_NAMES:
                plpy.warning("Column %s is renamed to %s because the name in %s conflicts with PostgreSQL system column names" % \
                        (plpy.quote_ident(original_name), plpy.quote_ident(name), where))
            else:
                plpy.warning("Column %s is renamed to %s because the name appears twice in %s" % \
                        (plpy.quote_ident(original_name), plpy.quote_ident(name), where))
        used_names.add(name)
        renamed.append(name)
    return renamed
def run_system_catalog_as_temp_table(server, user, catalog, schema, login_user, login_database, result_table, query):
    """Run *query* against Presto's information_schema inside a rolled-back
    subtransaction and materialize the result into the temp table
    *result_table*.

    Presto's schemas/tables are mirrored as PostgreSQL schema/table
    "holders", renamed into place for the duration of the query, then the
    whole subtransaction is rolled back. Results are cached per (server,
    user, catalog, schema) for 60 seconds.

    NOTE(review): ``login_database`` is accepted but never used here —
    presumably kept for interface compatibility; confirm against callers.
    """
    try:
        client = presto_client.Client(server=server, user=user, catalog=catalog, schema=schema, time_zone=_get_session_time_zone())

        # create SQL statements which put data to system catalogs
        if SchemaCacheEntry.is_cached(server, user, catalog, schema, time.time()):
            # Cache hit: reuse the prepared DDL statements and query cache.
            schema_names = SchemaCacheEntry.schema_names
            statements = SchemaCacheEntry.statements
            query_cache = SchemaCacheEntry.query_cache
        else:
            # get table list
            sql = "select table_schema, table_name, column_name, is_nullable, data_type" \
                  " from information_schema.columns"
            columns, rows = client.run(sql)
            schemas = {}
            if rows is None:
                rows = []
            for row in rows:
                schema_name = row[0]
                table_name = row[1]
                column_name = row[2]
                is_nullable = row[3]
                column_type = row[4]
                # Skip identifiers that would be truncated by PostgreSQL
                # (NAMEDATALEN limit); truncation could cause collisions.
                if len(schema_name) > PG_NAMEDATALEN - 1:
                    plpy.warning("Schema %s is skipped because its name is longer than %d characters" % \
                            (plpy.quote_ident(schema_name), PG_NAMEDATALEN - 1))
                    continue
                tables = schemas.setdefault(schema_name, {})
                if len(table_name) > PG_NAMEDATALEN - 1:
                    plpy.warning("Table %s.%s is skipped because its name is longer than %d characters" % \
                            (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), PG_NAMEDATALEN - 1))
                    continue
                # NOTE(review): this rebinds ``columns`` (previously the
                # client.run result header) to the table's column list.
                columns = tables.setdefault(table_name, [])
                if len(column_name) > PG_NAMEDATALEN - 1:
                    plpy.warning("Column %s.%s.%s is skipped because its name is longer than %d characters" % \
                            (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), \
                            plpy.quote_ident(column_name), PG_NAMEDATALEN - 1))
                    continue
                columns.append(Column(column_name, column_type, is_nullable))

            # generate SQL statements
            statements = []
            schema_names = []

            # create a restricted user using the same name with the login user name to pgpool2
            statements.append(
                "do $$ begin if not exists (select * from pg_catalog.pg_roles where rolname=%s) then create role %s with login; end if; end $$" % \
                (plpy.quote_literal(login_user), plpy.quote_ident(login_user)))
            # grant access on the all table holders to the restricted user
            statements.append("grant select on all tables in schema prestogres_catalog to %s" % \
                plpy.quote_ident(login_user))

            table_holder_id = 0
            for schema_name, tables in sorted(schemas.items(), key=lambda (k,v): k):
                if schema_name == "sys" or schema_name == "information_schema":
                    # skip system schemas
                    continue
                schema_names.append(schema_name)
                for table_name, columns in sorted(tables.items(), key=lambda (k,v): k):
                    # table schema
                    column_names = []
                    column_types = []
                    not_nulls = []
                    for column in columns:
                        column_names.append(column.name)
                        column_types.append(_pg_table_type(column.type))
                        not_nulls.append(not column.nullable)
                    # rename table holder into the schema
                    statements.append("alter table prestogres_catalog.table_holder_%d set schema %s" % \
                        (table_holder_id, plpy.quote_ident(schema_name)))
                    statements.append("alter table %s.table_holder_%d rename to %s" % \
                        (plpy.quote_ident(schema_name), table_holder_id, plpy.quote_ident(table_name)))
                    # change columns
                    alter_sql = _build_alter_table_holder_sql(schema_name, table_name, column_names, column_types, not_nulls)
                    statements.append(alter_sql)
                    table_holder_id += 1

            # cache expires after 60 seconds
            SchemaCacheEntry.set_cache(server, user, catalog, schema, schema_names, statements, time.time() + 60)
            query_cache = {}

        query_result = query_cache.get(query)
        if query_result:
            # Cached query: reuse the stored column metadata and rows.
            column_names = query_result.column_names
            column_types = query_result.column_types
            result = query_result.result
        else:
            # enter subtransaction to rollback tables right after running the query
            subxact = plpy.subtransaction()
            subxact.enter()
            try:
                # drop all schemas excepting prestogres_catalog, pg_catalog, information_schema, public
                # and schema holders
                sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
                      " where n.nspname not in ('prestogres_catalog', 'pg_catalog', 'information_schema', 'public')" \
                      " and n.nspname not like 'prestogres_catalog_schema_holder_%'" \
                      " and n.nspname !~ '^pg_toast'"
                for row in plpy.cursor(sql):
                    plpy.execute("drop schema %s cascade" % plpy.quote_ident(row["schema_name"]))

                # alter schema holders
                schema_holder_id = 0
                for schema_name in schema_names:
                    try:
                        plpy.execute("alter schema prestogres_catalog_schema_holder_%s rename to %s" % \
                            (schema_holder_id, plpy.quote_ident(schema_name)))
                        schema_holder_id += 1
                    except:
                        # ignore error?
                        pass

                # alter table holders in prestogres_catalog schema
                for statement in statements:
                    plpy.execute(statement)

                # drop prestogres_catalog schema
                plpy.execute("drop schema prestogres_catalog cascade")

                # drop schema holders
                sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
                      " where n.nspname like 'prestogres_catalog_schema_holder_%'"
                for row in plpy.cursor(sql):
                    plpy.execute("drop schema %s" % plpy.quote_ident(row["schema_name"]))

                # update pg_database
                # NOTE(review): ``schema_name`` here is the last value left
                # by the loops above — looks suspicious; confirm the intended
                # database name.
                plpy.execute("update pg_database set datname=%s where datname=current_database()" % \
                    plpy.quote_literal(schema_name))

                # switch to the restricted role
                plpy.execute("set role to %s" % plpy.quote_ident(login_user))

                # run the actual query and save result
                metadata = plpy.execute(query)
                column_names = metadata.colnames()
                column_type_oids = metadata.coltypes()
                result = map(lambda row: map(row.get, column_names), metadata)

                # save result schema
                oid_to_type_name = _load_oid_to_type_name_mapping(column_type_oids)
                column_types = map(oid_to_type_name.get, column_type_oids)

                # store query cache
                query_cache[query] = QueryResult(column_names, column_types, result)

            finally:
                # rollback subtransaction
                subxact.exit("rollback subtransaction", None, None)

        column_names = _rename_duplicated_column_names(column_names)
        create_sql = _build_create_temp_table_sql(result_table, column_names, column_types)
        insert_sql = _build_insert_into_sql(result_table, column_names)

        # run CREATE TABLE and INSERT
        plpy.execute("drop table if exists " + plpy.quote_ident(result_table))
        plpy.execute(create_sql)
        _batch_insert(insert_sql, 10, column_types, result)

    except (plpy.SPIError, presto_client.PrestoException) as e:
        # Set __module__ = "__module__" to generate pretty messages.
        e.__class__.__module__ = "__main__"
        raise
def setup_system_catalog(presto_server, presto_user, presto_catalog, access_role): client = presto_client.Client(server=presto_server, user=presto_user, catalog=presto_catalog, schema='default') # get table list sql = "select table_schema, table_name, column_name, is_nullable, data_type" \ " from information_schema.columns" columns, rows = client.run(sql) if rows is None: rows = [] schemas = {} for row in rows: schema_name = row[0] table_name = row[1] column_name = row[2] is_nullable = row[3] column_type = row[4] if schema_name == "sys" or schema_name == "information_schema": # skip system schemas continue if len(schema_name) > PG_NAMEDATALEN - 1: plpy.warning("Schema %s is skipped because its name is longer than %d characters" % \ (plpy.quote_ident(schema_name), PG_NAMEDATALEN - 1)) continue tables = schemas.setdefault(schema_name, {}) if len(table_name) > PG_NAMEDATALEN - 1: plpy.warning("Table %s.%s is skipped because its name is longer than %d characters" % \ (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), PG_NAMEDATALEN - 1)) continue columns = tables.setdefault(table_name, []) if len(column_name) > PG_NAMEDATALEN - 1: plpy.warning("Column %s.%s.%s is skipped because its name is longer than %d characters" % \ (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), \ plpy.quote_ident(column_name), PG_NAMEDATALEN - 1)) continue columns.append(Column(column_name, column_type, is_nullable)) # drop all schemas excepting prestogres_catalog, information_schema and pg_% sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \ " where n.nspname not in ('prestogres_catalog', 'information_schema')" \ " and n.nspname not like 'pg_%'" for row in plpy.cursor(sql): plpy.execute("drop schema %s cascade" % plpy.quote_ident(row["schema_name"])) # create schema and tables for schema_name, tables in sorted(schemas.items(), key=lambda (k,v): k): try: plpy.execute("create schema %s" % (plpy.quote_ident(schema_name))) except: # ignore error? 
pass # grant access on the all tables to the restricted user plpy.execute("grant select on all tables in schema %s to %s" % \ (plpy.quote_ident(schema_name), plpy.quote_ident(access_role))) for table_name, columns in sorted(tables.items(), key=lambda (k,v): k): column_names = [] column_types = [] not_nulls = [] for column in columns: column_names.append(column.name) column_types.append(_pg_table_type(column.type)) not_nulls.append(not column.nullable) # change columns create_sql = _build_create_table(schema_name, table_name, column_names, column_types, not_nulls) plpy.execute(create_sql) # update pg_database plpy.execute("update pg_database set datname=%s where datname=current_database()" % \ plpy.quote_literal(presto_catalog))
def setup_system_catalog(presto_server, presto_user, presto_catalog, presto_schema, access_role):
    """Mirror Presto's table metadata into this PostgreSQL database.

    Fetches the column list from Presto's information_schema, drops all
    non-system local schemas, recreates one PostgreSQL schema/table per
    Presto schema/table (skipping over-long identifiers, capping tables
    at PostgreSQL's 1600-column limit, and de-duplicating column names),
    grants usage/select to *access_role*, and replaces current_database()
    so it reports the Presto catalog name.
    """
    search_path = _get_session_search_path_array()
    if search_path == ['$user', 'public']:
        # search_path is default value.
        plpy.execute("set search_path to %s" % plpy.quote_ident(presto_schema))

    client = presto_client.Client(server=presto_server, user=presto_user, catalog=presto_catalog, schema='default')

    # get table list
    sql = "select table_schema, table_name, column_name, is_nullable, data_type" \
          " from information_schema.columns"
    columns, rows = client.run(sql)

    if rows is None:
        rows = []

    schemas = {}

    for row in rows:
        schema_name = row[0]
        table_name = row[1]
        column_name = row[2]
        is_nullable = row[3]
        column_type = row[4]

        if schema_name == "sys" or schema_name == "information_schema":
            # skip system schemas
            continue

        # Skip identifiers PostgreSQL would truncate (NAMEDATALEN limit).
        if len(schema_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Schema %s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), PG_NAMEDATALEN - 1))
            continue

        tables = schemas.setdefault(schema_name, {})

        if len(table_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Table %s.%s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), PG_NAMEDATALEN - 1))
            continue

        # NOTE(review): rebinds ``columns`` (the client.run result header)
        # to this table's column list.
        columns = tables.setdefault(table_name, [])

        if len(column_name) > PG_NAMEDATALEN - 1:
            plpy.warning("Column %s.%s.%s is skipped because its name is longer than %d characters" % \
                    (plpy.quote_ident(schema_name), plpy.quote_ident(table_name), \
                    plpy.quote_ident(column_name), PG_NAMEDATALEN - 1))
            continue

        columns.append(Column(column_name, column_type, is_nullable))

    # drop all schemas excepting prestogres_catalog, information_schema and pg_%
    sql = "select n.nspname as schema_name from pg_catalog.pg_namespace n" \
          " where n.nspname not in ('prestogres_catalog', 'information_schema')" \
          " and n.nspname not like 'pg_%'"
    for row in plpy.cursor(sql):
        plpy.execute("drop schema %s cascade" % plpy.quote_ident(row["schema_name"]))

    # create schema and tables
    for schema_name, tables in sorted(schemas.items(), key=lambda (k,v): k):
        try:
            plpy.execute("create schema %s" % (plpy.quote_ident(schema_name)))
        except:
            # ignore error?
            pass

        for table_name, columns in sorted(tables.items(), key=lambda (k,v): k):
            column_names = []
            column_types = []
            not_nulls = []
            # PostgreSQL tables cannot exceed 1600 columns; extras are cut.
            if len(columns) >= 1600:
                plpy.warning("Table %s.%s contains more than 1600 columns. Some columns will be inaccessible" % (plpy.quote_ident(schema_name), plpy.quote_ident(table_name)))
            for column in columns[0:1600]:
                column_names.append(column.name)
                column_types.append(_pg_table_type(column.type))
                not_nulls.append(not column.nullable)

            # change columns
            column_names = _rename_duplicated_column_names(column_names,
                    "%s.%s table" % (plpy.quote_ident(schema_name), plpy.quote_ident(table_name)))
            create_sql = _build_create_table(schema_name, table_name, column_names, column_types, not_nulls)
            plpy.execute(create_sql)

        # grant access on the schema to the restricted user so that
        # pg_table_is_visible(reloid) used by \d of psql command returns true
        plpy.execute("grant usage on schema %s to %s" % \
            (plpy.quote_ident(schema_name), plpy.quote_ident(access_role)))

        # this SELECT privilege is unnecessary because queries against those tables
        # won't run on PostgreSQL. causing an exception is good if Prestogres has
        # a bug sending a presto query to PostgreSQL without rewriting.
        # TODO however, it's granted for now because some BI tools might check
        # has_table_privilege. the best solution is to grant privilege but
        # actually selecting from those tables causes an exception.
        plpy.execute("grant select on all tables in schema %s to %s" % \
            (plpy.quote_ident(schema_name), plpy.quote_ident(access_role)))

    # fake current_database() to return Presto's catalog name to be compatible with some
    # applications that use db.schema.table syntax to identify a table
    if plpy.execute("select pg_catalog.current_database()")[0].values()[0] != presto_catalog:
        plpy.execute("delete from pg_catalog.pg_proc where proname='current_database'")
        plpy.execute("create function pg_catalog.current_database() returns name as $$begin return %s::name; end$$ language plpgsql stable strict" % \
            plpy.quote_literal(presto_catalog))
def log(msg):
    """Emit *msg* via plpy.warning when plpy is available, otherwise fall
    back to the module logger at debug level."""
    emit = plpy.warning if plpy else logger.debug
    emit(msg)