import re
import logging
from contextlib import closing
from itertools import chain

import psycopg2
import psycopg2.errorcodes

# Helper functions (column_exists, create_column, Table, get_table_names_v4,
# get_table_names, make_table_name, get_previous_timestamp, get_data_types,
# check_column_types, create_trend_table, enquote_column_name, quote_ident,
# Sql, datatype) and constants (SCHEMA, MAX_RETRIES) are assumed to be
# provided by the surrounding package.


def add_missing_columns(conn, schema, table_name, columns_to_check):
    """
    Add missing columns in `table_name`.

    :param conn: psycopg2 database connection
    :param schema: name of schema where specified table is located
    :param table_name: name of table that will be updated
    :param columns_to_check: list of tuples (column_name, data_type)
        specifying columns that must be checked and added when missing
    """
    with closing(conn.cursor()) as cursor:
        for (column_name, data_type) in columns_to_check:
            if not column_exists(conn, schema, table_name, column_name):
                create_column(cursor, Table(schema, table_name),
                    column_name, data_type)
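
# Usage sketch for add_missing_columns; a minimal example, assuming a
# reachable PostgreSQL database. The DSN, schema and table names below are
# hypothetical, not part of this module:
#
#   conn = psycopg2.connect("dbname=minerva")
#   add_missing_columns(conn, "trend", "some_trend_table", [
#       ("samples", "integer"),
#       ("octets_in", "bigint"),
#   ])
#   conn.commit()
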
def retrieve_aggregated(conn, datasource, granularity, entitytype,
        column_identifiers, interval, group_by, subquery_filter=None,
        relation_table_name=None):
    """
    Return aggregated data.

    :param conn: psycopg2 database connection
    :param datasource: datasource object
    :param granularity: granularity in seconds
    :param entitytype: entitytype object
    :param column_identifiers: e.g. SUM(trend1), MAX(trend2)
    :param interval: (start, end) tuple with non-naive timestamps
    :param group_by: list of columns to GROUP BY
    :param subquery_filter: optional subquery for additional filtering
        by JOINing on field 'id' = entity_id
    :param relation_table_name: optional relation table name for
        converting entity ids to related ones
    """
    start, end = interval

    with closing(conn.cursor()) as cursor:
        source_table_names = get_table_names_v4(cursor, [datasource],
            granularity, entitytype, start, end)

    def get_trend_names(column_identifier):
        if isinstance(column_identifier, Sql):
            return [a.name for a in column_identifier.args]
        else:
            trend_names_part = re.match(
                r".*\(([\w, ]+)\)", column_identifier).group(1)

            return map(str.strip, trend_names_part.split(","))

    trend_names = set(chain(*map(get_trend_names, column_identifiers)))

    # Deal with 'samples' column
    if column_exists(conn, SCHEMA, source_table_names[-1], "samples"):
        select_samples_part = "SUM(samples)"
        select_samples_column = "samples,"
    else:
        select_samples_part = "COUNT(*)"
        select_samples_column = ""

    args = {"start": start, "end": end}

    select_parts = []

    for source_table_name in source_table_names:
        join_parts = []

        return_id_field = "entity_id"

        if subquery_filter:
            join_parts.append(
                "JOIN ({0}) AS filter "
                "ON filter.id = \"{1}\".{2}.entity_id".format(
                    subquery_filter, SCHEMA,
                    enquote_column_name(source_table_name)))

        if relation_table_name:
            return_id_field = "r.target_id AS entity_id"

            join_parts.append(
                "JOIN relation.\"{0}\" r "
                "ON r.source_id = \"{1}\".entity_id".format(
                    relation_table_name, source_table_name))

        select_parts.append(
            "SELECT {0}, %(end)s, {1} {2} FROM \"{3}\".\"{4}\" {5}"
            " WHERE timestamp > %(start)s AND timestamp <= %(end)s".format(
                return_id_field,
                select_samples_column,
                ",".join(map(enquote_column_name, trend_names)),
                SCHEMA,
                source_table_name,
                " ".join(join_parts)))

    query = (
        "SELECT entity_id, %(end)s, {0}, {1} FROM ( {2} ) "
        "AS sources GROUP BY {3}").format(
            select_samples_part,
            ",".join(map(quote_ident, column_identifiers)),
            " UNION ALL ".join(select_parts),
            ",".join(map(enquote_column_name, group_by)))

    all_rows = []

    with closing(conn.cursor()) as cursor:
        try:
            cursor.execute(query, args)
        except psycopg2.ProgrammingError:
            logging.debug(cursor.mogrify(query, args))
            conn.rollback()
            # TODO: Check error code
        else:
            all_rows = cursor.fetchall()

    return all_rows
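
# Usage sketch for retrieve_aggregated; the datasource and entitytype
# objects and the trend names are illustrative assumptions supplied by the
# surrounding application:
#
#   interval = (datetime(2013, 4, 1, 12, tzinfo=pytz.utc),
#       datetime(2013, 4, 1, 13, tzinfo=pytz.utc))
#
#   rows = retrieve_aggregated(conn, datasource, 900, entitytype,
#       ["SUM(octets_in)", "MAX(octets_out)"], interval, ["entity_id"])
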
def aggregate(conn, schema, source, target, trend_names, timestamp):
    """
    Basic aggregation of trend data.

    :param conn: psycopg2 database connection
    :param schema: schema where source and target data is located
    :param source: tuple (datasource, gp, entitytype_name) specifying source
    :param target: tuple (datasource, gp, entitytype_name) specifying target
    :param trend_names: trends to aggregate
    :param timestamp: non-naive timestamp specifying end of interval to
        aggregate
    """
    target_gp = target[1]
    interval = (get_previous_timestamp(timestamp, target_gp), timestamp)

    (ds, gp, et_name) = source
    source_table_names = get_table_names(
        [ds], gp, et_name, interval[0], interval[1])

    target_table_name = make_table_name(*(target + (timestamp,)))

    if column_exists(conn, schema, source_table_names[-1], "samples"):
        select_samples_part = "SUM(samples)"
        select_samples_column = "samples,"
    else:
        select_samples_part = "COUNT(*)"
        select_samples_column = ""

    select_parts = []

    for source_table_name in source_table_names:
        select_parts.append(
            "SELECT "
            "entity_id, '{1}', {2} {3} "
            "FROM \"{0}\".\"{4}\" "
            "WHERE timestamp > %s AND timestamp <= %s ".format(
                schema,
                timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                select_samples_column,
                ",".join(["\"{0}\"".format(tn) for tn in trend_names]),
                source_table_name))

    query = (
        "INSERT INTO \"{0}\".\"{1}\" (entity_id, timestamp, samples, {2}) "
        "SELECT entity_id, '{4}', {5}, {6} FROM "
        "( {3} ) AS sources "
        "GROUP BY entity_id".format(
            schema,
            target_table_name,
            ",".join(["\"{0}\"".format(tn) for tn in trend_names]),
            " UNION ALL ".join(select_parts),
            timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            select_samples_part,
            ",".join(["SUM(\"{0}\")".format(tn) for tn in trend_names])))

    retry = True
    attempt = 0

    # The retry strategy below follows the trend_storage.store() function:
    # on specific database errors, repair the target table (widen column
    # types, add missing columns, or create the table) and run the query
    # again, up to MAX_RETRIES attempts.
    while retry is True:
        retry = False
        attempt += 1

        if attempt > MAX_RETRIES:
            raise MaxRetriesError(
                "Max retries ({0}) reached".format(MAX_RETRIES))

        try:
            with closing(conn.cursor()) as cursor:
                cursor.execute(query, len(source_table_names) * interval)
        except psycopg2.DatabaseError as exc:
            conn.rollback()

            columns = [("samples", "integer")]
            columns.extend(zip(trend_names, get_data_types(
                conn, schema, source_table_names[-1], trend_names)))

            if exc.pgcode == psycopg2.errorcodes.NUMERIC_VALUE_OUT_OF_RANGE:
                # Determine the maximum aggregated value per trend and derive
                # a sufficiently large data type from it, so the target
                # columns can be widened before retrying.
                max_values = []

                for source_table_name in source_table_names:
                    query_max_values = (
                        "SELECT {0} FROM "
                        "(SELECT "
                        " {1} "
                        "FROM \"{2}\".\"{3}\" "
                        "WHERE timestamp > %s AND timestamp <= %s "
                        "GROUP BY entity_id) AS sums").format(
                            ",".join(["MAX(\"{0}\")".format(tn)
                                for tn in trend_names]),
                            ",".join(["SUM(\"{0}\") AS \"{0}\"".format(tn)
                                for tn in trend_names]),
                            schema,
                            source_table_name)

                    with closing(conn.cursor()) as cursor:
                        cursor.execute(query_max_values, interval)
                        max_values.append(cursor.fetchone())

                data_types = [datatype.extract_from_value(v)
                    for v in map(max, zip(*max_values))]

                check_column_types(conn, schema, target_table_name,
                    trend_names, data_types)

                retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNIQUE_VIOLATION:
                raise NonRecoverableError("{0}, {1!s} in query '{2}'".format(
                    exc.pgcode, exc, query))
                # TODO: remove unique violating record from target
                # retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNDEFINED_COLUMN:
                column_names, data_types = zip(*columns)

                add_missing_columns(conn, schema, target_table_name,
                    zip(column_names, data_types))

                retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNDEFINED_TABLE:
                column_names, data_types = zip(*columns)

                create_trend_table(conn, schema, target_table_name,
                    column_names, data_types)

                retry = True
            else:
                raise NonRecoverableError("{0}, {1!s} in query '{2}'".format(
                    exc.pgcode, exc, query))
        else:
            conn.commit()
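
# Usage sketch for aggregate; aggregates one hour of 15-minute data into an
# hourly target table. All values are illustrative assumptions; the helper
# functions, MAX_RETRIES and the exception classes are expected to come from
# the surrounding package:
#
#   source = (datasource, 900, "Cell")
#   target = (datasource, 3600, "Cell")
#   timestamp = datetime(2013, 4, 1, 13, tzinfo=pytz.utc)
#
#   aggregate(conn, "trend", source, target,
#       ["octets_in", "octets_out"], timestamp)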