def repartition_exceptions_lists(conn, *, num_physical_shards, src_filter_sql=None):
    """Rebuild exceptions_lists with a new physical shard layout and swap it into place.

    A new parent table (``exceptions_lists_new``, LIST-partitioned on ``operator_id``) is
    created, one child partition is created per operator found in the existing table, the
    rows are copied across, indexes are added, the old table is dropped and the new one is
    renamed to ``exceptions_lists``. All statements run with the role set to
    ``dirbs_core_listgen``.

    :param conn: database connection
    :param num_physical_shards: number of physical shards passed on to each per-operator partition
    :param src_filter_sql: optional SQL fragment appended verbatim (wrapped in ``sql.SQL``,
                           so it is not escaped) after the ``SELECT`` that copies the rows
    """
    with conn.cursor() as cursor:
        with utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
            # New parent table mirroring the column definitions of the existing one
            cursor.execute(
                """CREATE TABLE exceptions_lists_new (
                       LIKE exceptions_lists INCLUDING DEFAULTS
                                             INCLUDING IDENTITY
                                             INCLUDING CONSTRAINTS
                                             INCLUDING STORAGE
                                             INCLUDING COMMENTS
                   )
                   PARTITION BY LIST (operator_id)
                """
            )
            _grant_perms_list(conn, part_name='exceptions_lists_new')

            # The operator ids are read from the invariants of the existing child tables
            existing_children = utils.child_table_names(conn, 'exceptions_lists')
            invariant_rows = utils.table_invariants_list(conn, existing_children, ['operator_id'])
            operator_ids = [row.operator_id for row in invariant_rows]

            # One child partition per operator (operator at top level, then IMEI-sharded)
            for operator_id in operator_ids:
                child_name = per_mno_lists_partition(operator_id=operator_id,
                                                     suffix='_new',
                                                     list_type='exceptions')
                create_per_mno_lists_partition(conn,
                                               parent_tbl_name='exceptions_lists_new',
                                               tbl_name=child_name,
                                               operator_id=operator_id,
                                               num_physical_shards=num_physical_shards)

            # Copy the rows across, optionally restricted by the caller-supplied fragment
            copy_stmt = sql.SQL("""INSERT INTO exceptions_lists_new SELECT * FROM exceptions_lists""")
            final_stmt = copy_stmt if src_filter_sql is None \
                else sql.SQL('{0} {1}').format(copy_stmt, sql.SQL(src_filter_sql))
            cursor.execute(final_stmt)

            # Indexes are added once the data is in place
            add_indices(conn, tbl_name='exceptions_lists_new', idx_metadata=exceptions_lists_indices())

            # Hand the sequence over to the new table before the old table is dropped
            cursor.execute('ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id')
            cursor.execute('DROP TABLE exceptions_lists CASCADE')

            # Give the new table (and its indexes) the original names
            rename_table_and_indices(conn,
                                     old_tbl_name='exceptions_lists_new',
                                     new_tbl_name='exceptions_lists',
                                     idx_metadata=exceptions_lists_indices())
def _validate_data_partitions(config: callable, conn: callable, month: int, year: int,
                              logger: callable, disable_data_check: bool) -> None:
    """
    Check that partitions exist for exactly the configured operators.

    Three checks are made: the per-operator partitions of monthly_network_triplets_per_mno,
    the per-operator partitions for the requested year/month, and the country-level partition
    for the requested year/month. Each failed check is logged as a warning when
    disable_data_check is set; otherwise it is logged as an error and an exception is raised.

    Arguments:
        config: DIRBS config object (region_config.operators is read)
        conn: DIRBS postgresql connection object
        month: data partition month
        year: data partition year
        logger: DIRBS logger object
        disable_data_check: when True, problems are only logged as warnings
    Returns:
        None
    Raises:
        MissingOperatorDataException: a configured operator has no partition
        ExtraOperatorDataException: a partition exists for an unconfigured operator, or the
                                    country-level partition for the month is absent
    """
    configured_ops = config.region_config.operators
    assert len(configured_ops) > 0

    def _must_raise(text):
        """Log text at the level chosen by disable_data_check; True means the caller raises."""
        if disable_data_check:
            logger.warning(text)
            return False
        logger.error(text)
        return True

    # --- Check 1: operator-level partitions -------------------------------------------------
    per_operator_tables = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
    seen_ops = set(utils.table_invariants_list(conn, per_operator_tables, ['operator_id']))
    wanted_ops = {(op.id, ) for op in configured_ops}

    absent_ops = wanted_ops - seen_ops
    if absent_ops:
        text = 'Missing monthly_network_triplets_per_mno partitions for operators: {0}' \
            .format(', '.join(entry[0] for entry in absent_ops))
        if _must_raise(text):
            raise exceptions.MissingOperatorDataException(text)

    surplus_ops = seen_ops - wanted_ops
    if surplus_ops:
        text = 'Extra monthly_network_triplets_per_mno partitions detected for unconfigured operators: {0}' \
            .format(', '.join(entry[0] for entry in surplus_ops))
        if _must_raise(text):
            raise exceptions.ExtraOperatorDataException(text)

    # --- Check 2: operator partitions for the requested month -------------------------------
    monthly_tables = {
        child
        for parent in per_operator_tables
        for child in utils.child_table_names(conn, parent)
    }
    seen_months = {
        row
        for row in utils.table_invariants_list(conn, monthly_tables,
                                               ['operator_id', 'triplet_year', 'triplet_month'])
        if row.triplet_year == year and row.triplet_month == month
    }
    wanted_months = {(op.id, year, month) for op in configured_ops}

    absent_months = wanted_months - seen_months
    if absent_months:
        text = 'Missing monthly_network_triplets_per_mno partitions for the requested reporting ' \
               'month for the following configured operators: {0}' \
            .format(', '.join(entry[0] for entry in absent_months))
        if _must_raise(text):
            raise exceptions.MissingOperatorDataException(text)

    surplus_months = seen_months - wanted_months
    if surplus_months:
        text = 'Extra monthly_network_triplets_per_mno partitions detected for the requested ' \
               'reporting month for the following unconfigured operators: {0}' \
            .format(', '.join(entry[0] for entry in surplus_months))
        if _must_raise(text):
            raise exceptions.ExtraOperatorDataException(text)

    # --- Check 3: country-level partition for the requested month ---------------------------
    country_tbl = partition_utils.monthly_network_triplets_country_partition(month=month, year=year)
    with conn.cursor() as cursor:
        cursor.execute(utils.table_exists_sql(), [country_tbl])
        found = cursor.fetchone()[0]
        if not found:
            text = 'Missing monthly_network_triplets_country partition for year and month'
            if _must_raise(text):
                raise exceptions.ExtraOperatorDataException(text)
def upgrade(self, db_conn):  # noqa: C901
    """Overrides AbstractMigrator upgrade method.

    Adds a ``virt_imei_shard`` column (computed by the new SQL function
    ``calc_virt_imei_shard``) to the classification, registration, pairing and blacklist
    tables and repartitions them; rebuilds notifications_lists and exceptions_lists as
    operator-partitioned tables; replaces seen_imeis with network_imeis and seen_triplets with
    the monthly_network_triplets_country / monthly_network_triplets_per_mno tables; recreates
    the gen_blacklist / gen_notifications_list / gen_exceptions_list functions so that they
    return ``virt_imei_shard``; and finally updates schema_metadata and drops obsolete columns.

    :param db_conn: database connection the migration statements are executed on
    """
    # NOTE(review): the nesting of the db_role_setter blocks below was reconstructed from
    # whitespace-collapsed source - confirm against version control before relying on it.
    logger = logging.getLogger('dirbs.db')
    with db_conn.cursor() as cursor:
        # SQL helper: characters 13-14 of the IMEI cast to SMALLINT; any error yields 0
        cursor.execute(
            """CREATE FUNCTION calc_virt_imei_shard(imei TEXT) RETURNS SMALLINT
               LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE
               AS $$
               BEGIN
                   RETURN SUBSTRING(COALESCE(imei, ''), 13, 2)::SMALLINT;
               EXCEPTION WHEN OTHERS THEN
                   RETURN 0;
               END;
               $$""")

        # By default, create 4 shards
        num_initial_shards = 4

        # Each of the next four tables follows the same pattern: add the column as nullable,
        # populate it, make it NOT NULL, then repartition.
        logger.info('Re-partitioning classification_state table...')
        cursor.execute(
            'ALTER TABLE classification_state ADD COLUMN virt_imei_shard SMALLINT'
        )
        cursor.execute(
            'UPDATE classification_state SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
        )
        cursor.execute(
            'ALTER TABLE classification_state ALTER COLUMN virt_imei_shard SET NOT NULL'
        )
        part_utils.repartition_classification_state(
            db_conn, num_physical_shards=num_initial_shards)
        logger.info('Re-partitioned classification_state table')

        logger.info('Re-partitioning registration_list table...')
        cursor.execute(
            'ALTER TABLE historic_registration_list ADD COLUMN virt_imei_shard SMALLINT'
        )
        cursor.execute(
            'UPDATE historic_registration_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
        )
        cursor.execute(
            'ALTER TABLE historic_registration_list ALTER COLUMN virt_imei_shard SET NOT NULL'
        )
        # Unlike the other tables, this one is repartitioned by a method on the migrator itself
        self.partition_registration_list(
            db_conn, num_physical_shards=num_initial_shards)
        logger.info('Re-partitioned registration_list table')

        logger.info('Re-partitioning pairing_list table...')
        cursor.execute(
            'ALTER TABLE historic_pairing_list ADD COLUMN virt_imei_shard SMALLINT'
        )
        cursor.execute(
            'UPDATE historic_pairing_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
        )
        cursor.execute(
            'ALTER TABLE historic_pairing_list ALTER COLUMN virt_imei_shard SET NOT NULL'
        )
        part_utils.repartition_pairing_list(
            db_conn, num_physical_shards=num_initial_shards)
        logger.info('Re-partitioned pairing_list table')

        logger.info('Re-partitioning blacklist table...')
        cursor.execute(
            'ALTER TABLE blacklist ADD COLUMN virt_imei_shard SMALLINT')
        cursor.execute(
            'UPDATE blacklist SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
        )
        cursor.execute(
            'ALTER TABLE blacklist ALTER COLUMN virt_imei_shard SET NOT NULL'
        )
        part_utils.repartition_blacklist(
            db_conn, num_physical_shards=num_initial_shards)
        logger.info('Re-partitioned blacklist table')

        # Need to make sure owner of list tables is dirbs_core_listgen
        logger.info('Re-partitioning notifications_lists table...')
        # The original notifications_lists were not created with a single sequence for the IDs, so just do now
        with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'):
            cursor.execute(
                """CREATE UNLOGGED TABLE notifications_lists_new (
                       row_id BIGSERIAL NOT NULL,
                       operator_id TEXT NOT NULL,
                       imei_norm TEXT NOT NULL,
                       imsi TEXT NOT NULL,
                       msisdn TEXT NOT NULL,
                       block_date DATE NOT NULL,
                       reasons TEXT[] NOT NULL,
                       amnesty_granted BOOLEAN DEFAULT FALSE NOT NULL,
                       start_run_id BIGINT NOT NULL,
                       end_run_id BIGINT,
                       delta_reason TEXT NOT NULL CHECK (delta_reason IN ('new',
                                                                          'resolved',
                                                                          'blacklisted',
                                                                          'no_longer_seen',
                                                                          'changed')),
                       virt_imei_shard SMALLINT NOT NULL
                   ) PARTITION BY LIST (operator_id)
                """)

            # Work out who the operators are
            partitions = utils.child_table_names(db_conn, 'notifications_lists')

            # Make sure that they are owned by dirbs_core_listgen (they can be owned by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen').
                        format(sql.Identifier(p)))

            operators = [
                x.operator_id for x in utils.table_invariants_list(
                    db_conn, partitions, ['operator_id'])
            ]

            # Create operator child partitions
            # (a single physical shard here; the repartition call below applies num_initial_shards)
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id,
                    suffix='_new',
                    list_type='notifications')
                part_utils.create_per_mno_lists_partition(
                    db_conn,
                    operator_id=op_id,
                    parent_tbl_name='notifications_lists_new',
                    tbl_name=tbl_name,
                    num_physical_shards=1,
                    unlogged=True,
                    fillfactor=100)

            # row_id and amnesty_granted are not in the column list, so they take their defaults
            cursor.execute(
                """INSERT INTO notifications_lists_new(operator_id, imei_norm, imsi, msisdn, block_date,
                                                       reasons, start_run_id, end_run_id, delta_reason,
                                                       virt_imei_shard)
                        SELECT operator_id, imei_norm, imsi, msisdn, block_date,
                               reasons, start_run_id, end_run_id, delta_reason,
                               calc_virt_imei_shard(imei_norm)
                          FROM notifications_lists
                """)

            # Drop old table, rename tables, indexes and constraints
            cursor.execute("""ALTER TABLE notifications_lists_new
                              RENAME CONSTRAINT notifications_lists_new_delta_reason_check
                              TO notifications_lists_delta_reason_check""")
            cursor.execute('DROP TABLE notifications_lists CASCADE')
            cursor.execute("""ALTER SEQUENCE notifications_lists_new_row_id_seq
                              RENAME TO notifications_lists_row_id_seq""")
            part_utils.rename_table_and_indices(
                db_conn,
                old_tbl_name='notifications_lists_new',
                new_tbl_name='notifications_lists')

        part_utils.repartition_notifications_lists(
            db_conn, num_physical_shards=num_initial_shards)
        logger.info('Re-partitioned notifications_lists table')

        logger.info('Re-partitioning exceptions_lists table...')
        # The original exceptions_lists were not created with a single sequence for the IDs, so just do now
        with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'):
            cursor.execute("""CREATE UNLOGGED TABLE exceptions_lists_new (
                                  row_id BIGSERIAL NOT NULL,
                                  operator_id TEXT NOT NULL,
                                  imei_norm TEXT NOT NULL,
                                  imsi TEXT NOT NULL,
                                  start_run_id BIGINT NOT NULL,
                                  end_run_id BIGINT,
                                  delta_reason TEXT NOT NULL CHECK (delta_reason IN ('added', 'removed')),
                                  virt_imei_shard SMALLINT NOT NULL
                              ) PARTITION BY LIST (operator_id)
                           """)

            # Work out who the operators are
            partitions = utils.child_table_names(db_conn, 'exceptions_lists')

            # Make sure that they are owned by dirbs_core_listgen (they can be owned by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen').
                        format(sql.Identifier(p)))

            operators = [
                x.operator_id for x in utils.table_invariants_list(
                    db_conn, partitions, ['operator_id'])
            ]

            # Create operator child partitions
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id, suffix='_new', list_type='exceptions')
                part_utils.create_per_mno_lists_partition(
                    db_conn,
                    operator_id=op_id,
                    parent_tbl_name='exceptions_lists_new',
                    tbl_name=tbl_name,
                    num_physical_shards=1,
                    unlogged=True,
                    fillfactor=100)

            # row_id is not in the column list, so it is assigned from the new BIGSERIAL sequence
            cursor.execute(
                """INSERT INTO exceptions_lists_new(operator_id, imei_norm, imsi, start_run_id,
                                                    end_run_id, delta_reason, virt_imei_shard)
                        SELECT operator_id, imei_norm, imsi, start_run_id, end_run_id, delta_reason,
                               calc_virt_imei_shard(imei_norm)
                          FROM exceptions_lists
                """)

            # Drop old table, rename tables, indexes and constraints
            cursor.execute("""ALTER TABLE exceptions_lists_new
                              RENAME CONSTRAINT exceptions_lists_new_delta_reason_check
                              TO exceptions_lists_delta_reason_check""")
            cursor.execute('DROP TABLE exceptions_lists CASCADE')
            cursor.execute(
                'ALTER SEQUENCE exceptions_lists_new_row_id_seq RENAME TO exceptions_lists_row_id_seq'
            )
            part_utils.rename_table_and_indices(
                db_conn,
                old_tbl_name='exceptions_lists_new',
                new_tbl_name='exceptions_lists')

        part_utils.repartition_exceptions_lists(
            db_conn, num_physical_shards=num_initial_shards)
        logger.info('Re-partitioned exceptions_lists table')

        logger.info('Re-partitioning seen_imeis (network_imeis) table')
        # First, just put everything in a temporary table so that we can call partutils
        with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'):
            cursor.execute("""CREATE UNLOGGED TABLE network_imeis (
                                  first_seen DATE NOT NULL,
                                  last_seen DATE NOT NULL,
                                  seen_rat_bitmask INTEGER,
                                  imei_norm TEXT NOT NULL,
                                  virt_imei_shard SMALLINT NOT NULL
                              )
                           """)

            #
            # We disable index scans here as doing a merge append with index scans is much slower and involves
            # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort
            # the results by imei_norm
            #
            cursor.execute('SET enable_indexscan = false')
            # One row per imei_norm: earliest first_seen, latest last_seen, OR of the RAT bitmasks
            cursor.execute("""INSERT INTO network_imeis
                                   SELECT MIN(first_seen),
                                          MAX(last_seen),
                                          bit_or(seen_rat_bitmask),
                                          imei_norm,
                                          calc_virt_imei_shard(imei_norm)
                                     FROM seen_imeis
                                 GROUP BY imei_norm
                           """)
            cursor.execute('SET enable_indexscan = true')

        part_utils.repartition_network_imeis(
            db_conn, num_physical_shards=num_initial_shards)
        cursor.execute('DROP TABLE seen_imeis CASCADE')
        logger.info('Re-partitioned seen_imeis (network_imeis) table')

        # First, just put all country-level triplets in a temporary table so that we can call partition_utils
        with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'):
            cursor.execute(
                """CREATE UNLOGGED TABLE monthly_network_triplets_country (
                       triplet_year SMALLINT NOT NULL,
                       triplet_month SMALLINT NOT NULL,
                       first_seen DATE NOT NULL,
                       last_seen DATE NOT NULL,
                       date_bitmask INTEGER NOT NULL,
                       triplet_hash UUID NOT NULL,
                       imei_norm TEXT,
                       imsi TEXT,
                       msisdn TEXT,
                       virt_imei_shard SMALLINT NOT NULL,
                       CHECK (last_seen >= first_seen),
                       CHECK (EXTRACT(month FROM last_seen) = triplet_month AND
                              EXTRACT(year FROM last_seen) = triplet_year),
                       CHECK (EXTRACT(month FROM first_seen) = triplet_month AND
                              EXTRACT(year FROM first_seen) = triplet_year)
                   ) PARTITION BY RANGE (triplet_year, triplet_month)
                """)

            # Work out what partitions to create and create them
            partitions = utils.child_table_names(db_conn, 'seen_triplets')

            # Make sure that they are owned by dirbs_core_import_operator (they can be owned by
            # dirbs_core_power_user) due to bad previous migration scripts
            with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL(
                            'ALTER TABLE {0} OWNER TO dirbs_core_import_operator'
                        ).format(sql.Identifier(p)))

            # Distinct (year, month) pairs present in the old seen_triplets partitions
            year_month_tuples = {
                (x.triplet_year, x.triplet_month)
                for x in utils.table_invariants_list(
                    db_conn, partitions, ['triplet_year', 'triplet_month'])
            }
            for year, month in year_month_tuples:
                part_utils.create_monthly_network_triplets_country_partition(
                    db_conn, month=month, year=year, num_physical_shards=1)

        with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'):
            # Same columns as the country-level table plus operator_id
            cursor.execute(
                """CREATE UNLOGGED TABLE monthly_network_triplets_per_mno (
                       LIKE monthly_network_triplets_country INCLUDING ALL,
                       operator_id TEXT NOT NULL
                   ) PARTITION BY LIST (operator_id)
                """)

            # Work out what partitions to create and create them
            op_year_month_tuples = {
                (x.operator_id, x.triplet_year, x.triplet_month)
                for x in utils.table_invariants_list(
                    db_conn, partitions,
                    ['operator_id', 'triplet_year', 'triplet_month'])
            }
            # Create child partitions at per-MNO level
            for op, year, month in op_year_month_tuples:
                part_utils.create_monthly_network_triplets_per_mno_partition(
                    db_conn,
                    operator_id=op,
                    month=month,
                    year=year,
                    num_physical_shards=1)

            # Create temporary monthly_network_triplets_per_mno table
            for year, month in year_month_tuples:
                logger.info(
                    'Generating temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}...'
                    .format(month, year))
                cursor.execute(
                    """INSERT INTO monthly_network_triplets_per_mno
                            SELECT %(year)s,
                                   %(month)s,
                                   first_seen,
                                   last_seen,
                                   date_bitmask,
                                   triplet_hash,
                                   imei_norm,
                                   imsi,
                                   msisdn,
                                   calc_virt_imei_shard(imei_norm),
                                   operator_id
                              FROM seen_triplets
                             WHERE triplet_year = %(year)s
                               AND triplet_month = %(month)s
                    """, {
                        'year': year,
                        'month': month
                    })
                logger.info(
                    'Generated temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}'
                    .format(month, year))

            # Create temporary monthly_network_triplets_country table. We need to do this monthly as we need
            # to aggregate by triplets on a monthly basis
            #
            # We disable index scans here as doing a merge append with index scans is much slower and involves
            # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort
            # the results by imei_norm
            #
            cursor.execute('SET enable_indexscan = false')
            for year, month in year_month_tuples:
                logger.info(
                    'Generating temporary monthly_network_triplets_country entries for {0:02d}/{1:d}...'
                    .format(month, year))
                # One row per triplet_hash within the month (rows from all operators are merged)
                cursor.execute(
                    """INSERT INTO monthly_network_triplets_country
                            SELECT %(year)s,
                                   %(month)s,
                                   MIN(first_seen),
                                   MAX(last_seen),
                                   bit_or(date_bitmask),
                                   triplet_hash,
                                   FIRST(imei_norm),
                                   FIRST(imsi),
                                   FIRST(msisdn),
                                   calc_virt_imei_shard(FIRST(imei_norm))
                              FROM seen_triplets
                             WHERE triplet_year = %(year)s
                               AND triplet_month = %(month)s
                          GROUP BY triplet_hash
                    """, {
                        'year': year,
                        'month': month
                    })
                logger.info(
                    'Generated temporary monthly_network_triplets_country entries for {0:02d}/{1:d}'
                    .format(month, year))
            cursor.execute('SET enable_indexscan = true')

        logger.info(
            'Re-partitioning temporary monthly_network_triplets tables...')
        # Previously, the operator_data view was owned by dirbs_core_power_user but is now owned by the
        # dirbs_core_import_operator since it must be re-created
        with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'):
            cursor.execute(
                'ALTER VIEW operator_data OWNER TO dirbs_core_import_operator'
            )
        part_utils.repartition_monthly_network_triplets(
            db_conn, num_physical_shards=num_initial_shards)
        cursor.execute('DROP TABLE seen_triplets CASCADE')
        logger.info(
            'Re-partitioned temporary monthly_network_triplets tables')

        # Replace list generation function to include virt_imei_shard
        cursor.execute("""
            DROP FUNCTION gen_blacklist(run_id BIGINT);
            DROP FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT);
            DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT);

            --
            -- Create function to generate a full blacklist for a given run_id. A value of -1 means get the latest
            -- list.
            --
            CREATE FUNCTION gen_blacklist(run_id BIGINT = -1)
                RETURNS TABLE (
                    imei_norm TEXT,
                    virt_imei_shard SMALLINT,
                    block_date DATE,
                    reasons TEXT[]
                )
                LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                AS $$
            DECLARE
                query_run_id BIGINT;
            BEGIN
                --
                -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows
                -- where end_run_id is NULL
                --
                IF run_id = -1 THEN
                    run_id := max_bigint();
                END IF;

                RETURN QUERY SELECT bl.imei_norm,
                                    bl.virt_imei_shard,
                                    bl.block_date,
                                    bl.reasons
                               FROM blacklist bl
                              WHERE bl.delta_reason != 'unblocked'
                                AND run_id >= bl.start_run_id
                                AND (run_id < bl.end_run_id OR bl.end_run_id IS NULL);
            END
            $$;

            --
            -- Create function to generate a full notifications_list for a given run_id and operator ID. A value
            -- of -1 means get the latest list.
            --
            CREATE FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT = -1)
                RETURNS TABLE (
                    imei_norm TEXT,
                    virt_imei_shard SMALLINT,
                    imsi TEXT,
                    msisdn TEXT,
                    block_date DATE,
                    reasons TEXT[],
                    amnesty_granted BOOLEAN
                )
                LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                AS $$
            BEGIN
                --
                -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows
                -- where end_run_id is NULL
                --
                IF run_id = -1 THEN
                    run_id := max_bigint();
                END IF;

                RETURN QUERY SELECT nl.imei_norm,
                                    nl.virt_imei_shard,
                                    nl.imsi,
                                    nl.msisdn,
                                    nl.block_date,
                                    nl.reasons,
                                    nl.amnesty_granted
                               FROM notifications_lists nl
                              WHERE nl.operator_id = op_id
                                AND nl.delta_reason NOT IN ('resolved', 'blacklisted')
                                AND run_id >= nl.start_run_id
                                AND (run_id < nl.end_run_id OR nl.end_run_id IS NULL);
            END
            $$;

            --
            -- Create function to generate a full exceptions_list for a given run_id and operator ID. A value
            -- of -1 means get the latest list.
            --
            CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1)
                RETURNS TABLE (
                    imei_norm TEXT,
                    virt_imei_shard SMALLINT,
                    imsi TEXT
                )
                LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                AS $$
            BEGIN
                --
                -- If we don't specify a run_id, just set to the maximum run_id which will always return all
                -- rows where end_run_id is NULL
                --
                IF run_id = -1 THEN
                    run_id := max_bigint();
                END IF;

                RETURN QUERY SELECT el.imei_norm,
                                    el.virt_imei_shard,
                                    el.imsi
                               FROM exceptions_lists el
                              WHERE el.operator_id = op_id
                                AND el.delta_reason != 'removed'
                                AND run_id >= el.start_run_id
                                AND (run_id < el.end_run_id OR el.end_run_id IS NULL);
            END
            $$;
        """)  # noqa: Q440, Q441

        # Update schema metadata table
        # (the default is only used to populate existing rows and is dropped straight afterwards)
        cursor.execute(
            """ALTER TABLE schema_metadata
               ADD COLUMN phys_shards SMALLINT NOT NULL DEFAULT %s CHECK (phys_shards > 0 AND phys_shards <= 100)""",
            [num_initial_shards])
        cursor.execute(
            'ALTER TABLE schema_metadata ALTER COLUMN phys_shards DROP DEFAULT'
        )

        # Drop obsolete columns
        cursor.execute(
            'ALTER TABLE schema_metadata DROP COLUMN potential_whitespace_imsis_msisdns'
        )
        cursor.execute(
            'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_imsi_records'
        )
        cursor.execute(
            'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_msisdn_records'
        )
def triplets(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root):
    """Prune old monthly_network_triplets data.

    Every monthly partition of monthly_network_triplets_country and of the per-operator
    children of monthly_network_triplets_per_mno is inspected. Partitions with no rows are
    dropped, as are partitions whose year/month is before the start of the retention window
    (first day of the current month minus ``config.retention_config.months_retention``
    months). Row counts of both parent tables are recorded before and after pruning, sent to
    statsd and stored as job metadata.

    :param ctx: context object; ``ctx.obj['CURR_DATE']`` may override today's date
    :param config: DIRBS config object (``retention_config.months_retention`` is read)
    :param statsd: statsd client used for the rows_before / rows_after gauges
    :param logger: logger object
    :param run_id: job run id used when storing metadata
    :param conn: database connection
    :param metadata_conn: database connection used for storing job metadata
    :param command: command name used when storing metadata
    :param metrics_root: not used by this function
    :param metrics_run_root: prefix of the statsd gauge names
    :raises AssertionError: if a partition has more than one (month, year) invariant, or if
                            the row-count difference does not match the rows pruned
    """
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        retention_months=config.retention_config.months_retention)

    if curr_date is None:
        curr_date = datetime.date.today()

    parent_tbl_names = [
        'monthly_network_triplets_country',
        'monthly_network_triplets_per_mno'
    ]

    def _count_parent_rows(cursor, adjective, metric_name):
        """Count the rows of each parent table, emit a statsd gauge per table, return {table: count}."""
        counts = {}
        for tbl in parent_tbl_names:
            logger.debug('Calculating {0} number of rows in {1} table...'.format(adjective, tbl))
            # Compose the table name as an identifier, as done for the partition tables below
            cursor.execute(sql.SQL('SELECT COUNT(*) FROM {0}').format(sql.Identifier(tbl)))
            counts[tbl] = cursor.fetchone()[0]
            logger.debug('Calculated {0} number of rows in {1} table'.format(adjective, tbl))
            statsd.gauge('{0}.{1}.{2}'.format(metrics_run_root, tbl, metric_name), counts[tbl])
        return counts

    with conn.cursor() as cursor:
        logger.info(
            'Pruning monthly_network_triplets data outside the retention window from database...'
        )
        retention_months = config.retention_config.months_retention
        first_month_to_drop = datetime.date(curr_date.year, curr_date.month, 1) \
            - relativedelta.relativedelta(months=retention_months)
        logger.info(
            'monthly_network_triplets partitions older than {0} will be pruned'
            .format(first_month_to_drop))

        # Collect the monthly partitions at country level and below each operator partition
        country_monthly_partitions = utils.child_table_names(conn, 'monthly_network_triplets_country')
        operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = []
        for op_partition in operator_partitions:
            operator_monthly_partitions.extend(utils.child_table_names(conn, op_partition))

        rows_before = _count_parent_rows(cursor, 'original', 'rows_before')
        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_before=rows_before)

        total_rows_pruned = 0
        total_partitions = country_monthly_partitions + operator_monthly_partitions
        for tblname in total_partitions:
            invariants_list = utils.table_invariants_list(conn, [tblname], ['triplet_month', 'triplet_year'])
            assert len(invariants_list) <= 1
            tbl_identifier = sql.Identifier(tblname)

            if not invariants_list:
                # No invariant row means the partition holds no data at all
                logger.warning('Found empty partition {0}. Dropping...'.format(tblname))
                cursor.execute(sql.SQL("""DROP TABLE {0} CASCADE""").format(tbl_identifier))
                continue

            month, year = tuple(invariants_list[0])

            # Check if table year/month is outside the retention window
            if datetime.date(year, month, 1) < first_month_to_drop:
                # Calculate number of rows in the partition table
                cursor.execute(sql.SQL("""SELECT COUNT(*) FROM {0}""").format(tbl_identifier))
                partition_table_rows = cursor.fetchone()[0]
                total_rows_pruned += partition_table_rows

                logger.info('Dropping table {0} with {1} rows...'.format(tblname, partition_table_rows))
                cursor.execute(sql.SQL("""DROP TABLE {0} CASCADE""").format(tbl_identifier))
                logger.info('Dropped table {0}'.format(tblname))

        rows_after = _count_parent_rows(cursor, 'new', 'rows_after')
        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_after=rows_after)

        # Sanity check: the drop in total row count must equal what was counted as pruned
        total_rows_before = sum(rows_before.values())
        total_rows_after = sum(rows_after.values())
        assert (total_rows_before - total_rows_after) == total_rows_pruned
        logger.info(
            'Pruned {0:d} rows of monthly_network_triplets data outside the retention window from database'
            .format(total_rows_pruned))
def _repartition_exceptions_lists(self, conn, *, num_physical_shards):
    """Rebuild exceptions_lists with an additional msisdn column.

    A new operator-partitioned parent table is created with a ``msisdn`` column, populated by
    joining the existing exceptions_lists rows to historic_pairing_list on ``imsi``, and then
    swapped into place. Afterwards gen_exceptions_list and gen_delta_exceptions_list are
    dropped and recreated so that they return ``msisdn`` as well.

    :param conn: database connection
    :param num_physical_shards: number of physical shards passed on to each per-operator partition
    """
    create_parent_sql = """CREATE TABLE exceptions_lists_new (
                               LIKE exceptions_lists INCLUDING DEFAULTS
                                                     INCLUDING IDENTITY
                                                     INCLUDING CONSTRAINTS
                                                     INCLUDING STORAGE
                                                     INCLUDING COMMENTS
                           )
                           PARTITION BY LIST (operator_id);

                           ALTER TABLE exceptions_lists_new ADD COLUMN msisdn TEXT NOT NULL;
                        """

    copy_rows_sql = """INSERT INTO exceptions_lists_new
                            SELECT e.row_id, e.operator_id, e.imei_norm, e.imsi, e.start_run_id, e.end_run_id,
                                   e.delta_reason, e.virt_imei_shard, p.msisdn
                              FROM exceptions_lists e
                        INNER JOIN historic_pairing_list p ON e.imsi = p.imsi"""

    recreate_functions_sql = """
                    DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT);

                    --
                    -- Recreate function to generate a full exceptions_list for a given
                    -- run_id and operator.
                    -- A value of -1 means get the latest list.
                    --
                    CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1)
                        RETURNS TABLE (
                            imei_norm TEXT,
                            virt_imei_shard SMALLINT,
                            imsi TEXT,
                            msisdn TEXT
                        )
                        LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                        AS $$
                    BEGIN
                        --
                        -- If we don't specify a run_id, just set to the maximum run_id which will always
                        -- return all rows where end_run_id is NULL
                        --
                        IF run_id = -1 THEN
                            run_id := max_bigint();
                        END IF;

                        RETURN QUERY SELECT el.imei_norm,
                                            el.virt_imei_shard,
                                            el.imsi,
                                            el.msisdn
                                       FROM exceptions_lists el
                                      WHERE el.operator_id = op_id
                                        AND el.delta_reason != 'removed'
                                        AND run_id >= el.start_run_id
                                        AND (run_id < el.end_run_id OR el.end_run_id IS NULL);
                    END
                    $$;

                    DROP FUNCTION gen_delta_exceptions_list(op_id TEXT, base_run_id BIGINT, run_id BIGINT);

                    --
                    -- Create function to generate a per-MNO delta exceptions list for a run_id, operator
                    -- id and optional base_run_id.
                    --
                    -- If not base_run_id is supplied, this function will use the maximum run_id found in
                    -- the DB that it less than than the supplied run_id
                    --
                    CREATE FUNCTION gen_delta_exceptions_list(op_id TEXT, base_run_id BIGINT, run_id BIGINT = -1)
                        RETURNS TABLE (
                            imei_norm TEXT,
                            imsi TEXT,
                            msisdn TEXT,
                            delta_reason TEXT
                        )
                        LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                        AS $$
                    BEGIN
                        --
                        -- If we don't specify a run_id, just set to the maximum run_id
                        --
                        IF run_id = -1 THEN
                            run_id := max_bigint();
                        END IF;

                        IF run_id < base_run_id THEN
                            RAISE EXCEPTION 'Parameter base_run_id % greater than run_id %',
                                            base_run_id, run_id;
                        END IF;

                        RETURN QUERY SELECT *
                                       FROM (SELECT el.imei_norm,
                                                    el.imsi,
                                                    el.msisdn,
                                                    overall_delta_reason(el.delta_reason
                                                                         ORDER BY start_run_id DESC) AS delta_reason
                                               FROM exceptions_lists el
                                              WHERE operator_id = op_id
                                                AND start_run_id > base_run_id
                                                AND start_run_id <= run_id
                                           GROUP BY el.imei_norm, el.imsi, el.msisdn) x
                                      WHERE x.delta_reason IS NOT NULL;
                    END
                    $$;
                    """  # noqa: Q440, Q441

    with conn.cursor() as cursor:
        with utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
            # New parent table: same definition as the current one plus the msisdn column
            cursor.execute(create_parent_sql)

            # grant relevant permissions
            part_utils._grant_perms_list(conn, part_name='exceptions_lists_new')

            # The operators are read from the invariants of the existing child tables
            existing_children = utils.child_table_names(conn, 'exceptions_lists')
            invariant_rows = utils.table_invariants_list(conn, existing_children, ['operator_id'])
            operator_ids = [row.operator_id for row in invariant_rows]

            # One child partition per operator (operator at top level, then IMEI sharded)
            for operator_id in operator_ids:
                child_name = part_utils.per_mno_lists_partition(operator_id=operator_id,
                                                                suffix='_new',
                                                                list_type='exceptions')
                part_utils.create_per_mno_lists_partition(conn,
                                                          parent_tbl_name='exceptions_lists_new',
                                                          tbl_name=child_name,
                                                          operator_id=operator_id,
                                                          num_physical_shards=num_physical_shards)

            # Populate the new parent; msisdn comes from the pairing list row with the same imsi
            cursor.execute(copy_rows_sql)

            # Indexes are added on each partition after the data is loaded
            part_utils.add_indices(conn,
                                   tbl_name='exceptions_lists_new',
                                   idx_metadata=part_utils.exceptions_lists_indices())

            # The sequence is re-assigned to the new table before the old table is dropped
            cursor.execute(
                'ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id'
            )
            cursor.execute('DROP TABLE exceptions_lists CASCADE')

            # The new table, its indexes and constraints take over the original names
            part_utils.rename_table_and_indices(conn,
                                                old_tbl_name='exceptions_lists_new',
                                                new_tbl_name='exceptions_lists',
                                                idx_metadata=part_utils.exceptions_lists_indices())

            # The list-generation functions are recreated under the power user role
            with utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
                cursor.execute(recreate_functions_sql)
def repartition_monthly_network_triplets(conn, *, num_physical_shards):
    """Function to repartition the monthly_network_triplets_country and monthly_network_triplets_per_mno tables.

    New parent tables are created alongside the existing ones, child partitions are created
    for every (year, month) and (operator, year, month) combination found in the existing
    tables, the rows are copied across, indexes are added, the old tables are dropped and the
    new ones are renamed into place. The dependent views (operator_data, the *_no_null_imeis
    views and monthly_network_triplets_with_invalid_data_flags) are then (re)created and
    SELECT is granted on them. All statements run with the role set to
    ``dirbs_core_import_operator``.

    Partitions for the most recent month get a fill factor of 45; all others get 100. The
    most recent month is taken from the country-level partitions; if there are none, it is
    taken from the per-MNO partitions instead.

    :param conn: database connection
    :param num_physical_shards: number of physical shards passed on to each monthly partition
    """
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'):
        # Create parent partitions
        cursor.execute(
            """CREATE TABLE monthly_network_triplets_country_new (
                   LIKE monthly_network_triplets_country INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               ) PARTITION BY RANGE (triplet_year, triplet_month)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_country_new')

        cursor.execute(
            """CREATE TABLE monthly_network_triplets_per_mno_new (
                   LIKE monthly_network_triplets_per_mno INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               ) PARTITION BY LIST (operator_id)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_per_mno_new')

        # Work out what year-month tuples we have
        country_monthly_partitions = utils.child_table_names(conn, 'monthly_network_triplets_country')
        country_year_month_tuples = [(x.triplet_year, x.triplet_month)
                                     for x in utils.table_invariants_list(conn,
                                                                          country_monthly_partitions,
                                                                          ['triplet_year', 'triplet_month'])]

        operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = set()
        for op_partition in operator_partitions:
            operator_monthly_partitions.update(utils.child_table_names(conn, op_partition))
        mno_year_month_tuples = [(x.operator_id, x.triplet_year, x.triplet_month)
                                 for x in utils.table_invariants_list(conn,
                                                                      operator_monthly_partitions,
                                                                      ['operator_id', 'triplet_year', 'triplet_month'])]

        # Sort year month tuples (newest first) and get the maximum year month combination.
        country_year_month_tuples = sorted(country_year_month_tuples, reverse=True)
        if country_year_month_tuples:
            latest_year_month = country_year_month_tuples[0]
        elif mno_year_month_tuples:
            # No country-level partitions: fall back to the newest per-MNO month rather than
            # leaving this as None, which previously made the per-MNO loop fail on unpacking
            latest_year_month = max((year, month) for _, year, month in mno_year_month_tuples)
        else:
            latest_year_month = None

        def _fillfactor_for(year, month):
            """Fillfactor is 45 for the most recent month since it will likely still be updated.

            For older months we pack tightly to ensure optimal usage of disk space and optimal
            scan performance.
            """
            return 45 if (year, month) == latest_year_month else 100

        # Create child partitions at country level
        for year, month in country_year_month_tuples:
            create_monthly_network_triplets_country_partition(conn, month=month, year=year,
                                                              suffix='_new',
                                                              num_physical_shards=num_physical_shards,
                                                              fillfactor=_fillfactor_for(year, month))

        # Create child partitions at per-MNO level
        for op, year, month in mno_year_month_tuples:
            create_monthly_network_triplets_per_mno_partition(conn, operator_id=op, month=month, year=year,
                                                              suffix='_new',
                                                              num_physical_shards=num_physical_shards,
                                                              fillfactor=_fillfactor_for(year, month))

        # Populate country-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_country_new
                               SELECT *
                                 FROM monthly_network_triplets_country""")

        # Populate per-MNO-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_per_mno_new
                               SELECT *
                                 FROM monthly_network_triplets_per_mno""")

        # Add in indexes
        add_indices(conn, tbl_name='monthly_network_triplets_country_new',
                    idx_metadata=monthly_network_triplets_country_indices())
        add_indices(conn, tbl_name='monthly_network_triplets_per_mno_new',
                    idx_metadata=monthly_network_triplets_per_mno_indices())

        # Drop old tables
        cursor.execute('DROP TABLE monthly_network_triplets_country CASCADE')
        cursor.execute('DROP TABLE monthly_network_triplets_per_mno CASCADE')

        # Renames tables
        rename_table_and_indices(conn,
                                 old_tbl_name='monthly_network_triplets_country_new',
                                 new_tbl_name='monthly_network_triplets_country',
                                 idx_metadata=monthly_network_triplets_country_indices())
        rename_table_and_indices(conn,
                                 old_tbl_name='monthly_network_triplets_per_mno_new',
                                 new_tbl_name='monthly_network_triplets_per_mno',
                                 idx_metadata=monthly_network_triplets_per_mno_indices())

        # operator_data expands date_bitmask into one row per day of the month whose bit is set
        cursor.execute("""CREATE OR REPLACE VIEW operator_data AS
                              SELECT sq.connection_date,
                                     sq.imei_norm,
                                     sq.imsi,
                                     sq.msisdn,
                                     sq.operator_id
                                FROM (SELECT make_date(nt.triplet_year::integer,
                                                       nt.triplet_month::integer,
                                                       dom.dom) AS connection_date,
                                             nt.imei_norm,
                                             nt.imsi,
                                             nt.msisdn,
                                             nt.operator_id
                                        FROM generate_series(1, 31) dom(dom),
                                             monthly_network_triplets_per_mno nt
                                       WHERE (nt.date_bitmask & (1 << (dom.dom - 1))) <> 0) sq""")

        cursor.execute("""CREATE VIEW monthly_network_triplets_country_no_null_imeis AS
                              SELECT *
                                FROM monthly_network_triplets_country
                               WHERE imei_norm IS NOT NULL""")

        cursor.execute("""CREATE VIEW monthly_network_triplets_per_mno_no_null_imeis AS
                              SELECT *
                                FROM monthly_network_triplets_per_mno
                               WHERE imei_norm IS NOT NULL""")

        cursor.execute(sql.SQL('GRANT SELECT ON operator_data TO dirbs_core_base'))
        for role in ['dirbs_core_listgen', 'dirbs_core_classify', 'dirbs_core_report', 'dirbs_core_api']:
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_country_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_per_mno_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))

        cursor.execute("""CREATE VIEW monthly_network_triplets_with_invalid_data_flags AS
                              SELECT nt.*,
                                     nt.imei_norm IS NULL AS is_null_imei,
                                     is_unclean_imei(nt.imei_norm) AS is_unclean_imei,
                                     nt.imsi IS NULL AS is_null_imsi,
                                     is_unclean_imsi(nt.imsi) AS is_unclean_imsi,
                                     nt.msisdn IS NULL AS is_null_msisdn
                                FROM monthly_network_triplets_per_mno nt""")
def _validate_data_partitions(config, conn, month, year, logger, disable_data_check):
    """
    Validate that data is present for all configured operators and only configured operators.

    Three checks run in order:

    1. The operators owning a monthly_network_triplets_per_mno partition match the configured operators.
    2. The same holds when only partitions for the requested year and month are considered.
    3. The monthly_network_triplets_country partition for the requested year and month exists.

    :param config: dirbs config obj (config.region_config.operators is read)
    :param conn: database connection
    :param month: data month
    :param year: data year
    :param logger: dirbs logger obj
    :param disable_data_check: data check flag; when truthy a failed check is only logged as a warning
                               and validation continues, otherwise it is logged as an error and raised
    :raises exceptions.MissingOperatorDataException: a configured operator has no per-MNO partition,
                                                     either at all or for the requested month
    :raises exceptions.ExtraOperatorDataException: an unconfigured operator has a per-MNO partition, or the
                                                   country partition for the requested month is missing
    """
    def _report(msg, exc_type):
        """Warn when data checks are disabled; otherwise log an error and raise exc_type."""
        if disable_data_check:
            # logger.warning replaces the deprecated logger.warn alias
            logger.warning(msg)
        else:
            logger.error(msg)
            raise exc_type(msg)

    def _operator_list(invariants):
        """Render the operator id (first field) of each invariant tuple as a stable, comma-separated list."""
        # Sorted so the message does not depend on set iteration order; str() so a non-string id cannot
        # break the join.
        return ', '.join(sorted(str(x[0]) for x in invariants))

    operators = config.region_config.operators
    assert len(operators) > 0

    # Check 1: operators with any per-MNO partition vs. configured operators. Both sides are sets of 1-tuples
    # so that plain set difference can be used.
    operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
    observed_operator_ids = set(utils.table_invariants_list(conn, operator_partitions, ['operator_id']))
    required_operator_ids = {(o.id,) for o in operators}

    missing_operator_ids = required_operator_ids - observed_operator_ids
    if missing_operator_ids:
        _report('Missing monthly_network_triplets_per_mno partitions for operators: {0}'
                .format(_operator_list(missing_operator_ids)),
                exceptions.MissingOperatorDataException)

    extra_operator_ids = observed_operator_ids - required_operator_ids
    if extra_operator_ids:
        _report('Extra monthly_network_triplets_per_mno partitions detected for unconfigured operators: {0}'
                .format(_operator_list(extra_operator_ids)),
                exceptions.ExtraOperatorDataException)

    # Check 2: same comparison, restricted to the partitions of the requested year and month.
    operator_monthly_partitions = set()
    for op_partition in operator_partitions:
        operator_monthly_partitions.update(utils.child_table_names(conn, op_partition))

    observed_invariants = {
        x for x in utils.table_invariants_list(conn, operator_monthly_partitions,
                                               ['operator_id', 'triplet_year', 'triplet_month'])
        if x.triplet_year == year and x.triplet_month == month
    }
    required_invariants = {(o.id, year, month) for o in operators}

    missing_invariants = required_invariants - observed_invariants
    if missing_invariants:
        _report('Missing monthly_network_triplets_per_mno partitions for the requested reporting '
                'month for the following configured operators: {0}'
                .format(_operator_list(missing_invariants)),
                exceptions.MissingOperatorDataException)

    extra_invariants = observed_invariants - required_invariants
    if extra_invariants:
        _report('Extra monthly_network_triplets_per_mno partitions detected for the requested '
                'reporting month for the following unconfigured operators: {0}'
                .format(_operator_list(extra_invariants)),
                exceptions.ExtraOperatorDataException)

    # Check 3: the country-level partition for the requested year and month must exist.
    country_imei_shard_name = partition_utils.monthly_network_triplets_country_partition(month=month, year=year)
    with conn.cursor() as cursor:
        cursor.execute(utils.table_exists_sql(), [country_imei_shard_name])
        partition_exists = cursor.fetchone()[0]

    if not partition_exists:
        # NOTE(review): a *missing* partition is reported with ExtraOperatorDataException. That looks like a
        # copy/paste slip, but the type is kept because callers may catch it -- confirm before changing.
        _report('Missing monthly_network_triplets_country partition for year and month',
                exceptions.ExtraOperatorDataException)