def _do_final_cleanup(conn, logger, is_locked, tables_to_delete):
    """
    Perform final cleanup: drop intermediate tables and release the advisory lock.

    :param conn: database connection obj
    :param logger: dirbs logger obj
    :param is_locked: bool (True if we currently hold the dirbs-classify postgres advisory lock)
    :param tables_to_delete: list of intermediate table names to drop
    """
    if is_locked:
        with conn.cursor() as cursor:
            # Release the advisory lock taken at the start of classification so that
            # other dirbs-classify runs can proceed
            cursor.execute('SELECT pg_advisory_unlock(%s::BIGINT)',
                           [hash_string_64bit('dirbs-classify')])

    with conn.cursor() as cursor:
        # Work on a copy so we can report exactly which tables are still un-dropped
        # if one of the DROP statements fails part-way through
        remaining_tables_to_delete = copy.copy(tables_to_delete)
        for t in tables_to_delete:
            try:
                # sql.Identifier safely quotes the table name (avoids SQL injection)
                cursor.execute(
                    sql.SQL('DROP TABLE IF EXISTS {0} CASCADE').format(sql.Identifier(t)))
                # Commit per-table so tables already dropped stay dropped even if a
                # later DROP fails
                conn.commit()
                remaining_tables_to_delete.remove(t)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
                # not intercepted here; warn about every table not yet dropped, then
                # re-raise the original error
                for t_not_deleted in remaining_tables_to_delete:
                    # logger.warning: `warn` is a deprecated alias in the stdlib logging API
                    logger.warning(
                        'Failed to drop table {0} due to exception. Please issue '
                        '\'DROP TABLE IF EXISTS {0}\' manually!'.format(t_not_deleted))
                raise
def _import_lock_key(self):
    """Key for the advisory lock guarding against concurrent imports of the same type.

    Subclasses should override this if they want to allow concurrent imports.
    For example, the operator data importer allows multiple imports to run at
    once as long as they are for different operators.
    """
    lock_key = hash_string_64bit(self._import_type)
    return lock_key
def cli(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
        metrics_root, metrics_run_root, conditions, safety_check, curr_date,
        disable_sanity_checks):
    """
    DIRBS script to classify IMEIs.

    Iterates through all configured conditions and writes to the
    classification_state table.

    :param ctx: click command context
    :param config: dirbs config instance
    :param statsd: statsd instance
    :param logger: dirbs logger instance
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database connection for job metadata
    :param command: command name
    :param metrics_root: StatsD metrics key root (unused directly in this function)
    :param metrics_run_root: per-run StatsD metrics key prefix (used for matched_imeis gauges)
    :param conditions: list of user supplied conditions
    :param safety_check: bool (enable/disable safety check)
    :param curr_date: date to use for classification
    :param disable_sanity_checks: bool (enable/disable sanity checks)
    """
    _warn_about_curr_date(curr_date, logger)
    _warn_about_disabled_safety_check(safety_check, logger)

    # If we didn't specify a condition, use all configured conditions
    if conditions is None:
        conditions = config.conditions

    # Query the job metadata table for all successful classification runs
    successful_job_runs = metadata.query_for_command_runs(metadata_conn, 'dirbs-classify',
                                                          successful_only=True)
    # Compare current config against the most recent successful run's metadata;
    # refuse to run if they differ (unless sanity checks are disabled)
    if successful_job_runs and not disable_sanity_checks and not _perform_sanity_checks(
            config, successful_job_runs[0].extra_metadata):
        raise ClassifySanityCheckFailedException(
            'Sanity checks failed, configurations are not identical to the last successful classification'
        )

    logger.info('Classifying using conditions: {0}'.format(','.join(
        [c.label for c in conditions])))

    # Store metadata about this classification run
    metadata.add_optional_job_metadata(
        metadata_conn, command, run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        conditions=[c.as_dict() for c in conditions],
        operators=[op.as_dict() for op in config.region_config.operators],
        amnesty=config.amnesty_config.as_dict())

    # Per-condition intermediate tables (recorded so they can be dropped in cleanup)
    intermediate_tables = []
    # Flag indicating whether we had a failure to change exit code
    had_errored_condition = False
    try:
        locked = False
        with conn, conn.cursor() as cursor:
            # Lock to prevent multiple simultaneous classifications
            cursor.execute('SELECT pg_try_advisory_lock(%s::BIGINT)',
                           [hash_string_64bit('dirbs-classify')])
            locked = cursor.fetchone()[0]
            if not locked:
                raise ClassifyLockException(
                    'Could not acquire lock for classification. '
                    'Are there any other dirbs-classify instances running at the moment?'
                )

            # Calculate total IMEI count (only needed as the denominator for the
            # per-condition max-matching-ratio safety check)
            if safety_check:
                logger.info(
                    'Counting number of IMEIs in network_imeis for safety check...'
                )
                cursor.execute('SELECT COUNT(*) FROM network_imeis')
                total_imei_count = cursor.fetchone()[0]
                logger.info(
                    'Finished counting number of IMEIs in network_imeis for safety check'
                )
            else:
                # Sentinel: disables the ratio check below (total_imei_count > 0 is False)
                total_imei_count = -1

        matched_imei_counts = {}
        nworkers = config.multiprocessing_config.max_db_connections
        condition_objs = [Condition(cond_config) for cond_config in conditions]
        with futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
            logger.info(
                'Simultaneously classifying {0:d} dimensions using up to {1:d} workers...'
                .format(len(conditions), nworkers))
            # Maps of future -> condition so completed futures can be attributed
            calc_futures_to_condition = {}
            update_futures_to_condition = {}
            # Per-condition progress/result counters, keyed by condition label
            per_condition_state = defaultdict(
                lambda: dict(num_completed_calc_jobs=0,
                             num_total_calc_jobs=0,
                             num_completed_update_jobs=0,
                             num_total_update_jobs=0,
                             num_matched_imeis=0))
            for c in condition_objs:
                # Make sure we record all temporary tables so that we can cleanup later
                intermediate_tables.append(c.intermediate_tbl_name(run_id))
                # Queue the condition calculations and keep track
                for f in c.queue_calc_imeis_jobs(executor, config, run_id, curr_date):
                    calc_futures_to_condition[f] = c
                    per_condition_state[c.label]['num_total_calc_jobs'] += 1

            # Process calculation futures as each condition's calc jobs all complete
            for condition, job_state in _completed_calc_jobs(
                    calc_futures_to_condition, per_condition_state, logger):
                max_ratio = condition.config.max_allowed_matching_ratio
                num_matched_imeis = job_state['num_matched_imeis']
                max_matched_imeis = max_ratio * total_imei_count
                # Safety check: refuse to update classification state for a condition
                # matching more than its configured share of all IMEIs on the network
                if safety_check and total_imei_count > 0 and num_matched_imeis > max_matched_imeis:
                    ratio = min(num_matched_imeis / total_imei_count, 1)
                    logger.error(
                        'Refusing to classify using condition \'{0}\': '
                        'This condition matches more than the maximum number of IMEIs allowed by the '
                        'condition\'s configuration '
                        '(matched_imeis={1:d}, ratio={2:f}, max_ratio={3:f})'
                        .format(condition.label, num_matched_imeis, ratio, max_ratio))
                    had_errored_condition = True
                else:
                    # Queue the classification state updates and keep track
                    for f in condition.queue_update_classification_state_jobs(
                            executor, config, run_id, curr_date):
                        update_futures_to_condition[f] = condition
                        per_condition_state[
                            condition.label]['num_total_update_jobs'] += 1

            # Process update futures
            for condition, job_state in _completed_update_jobs(
                    update_futures_to_condition, per_condition_state, logger):
                # Update metadata about matched IMEI counts every time each condition finishes
                matched_imei_counts[
                    condition.label] = job_state['num_matched_imeis']
                metadata.add_optional_job_metadata(
                    metadata_conn, command, run_id,
                    matched_imei_counts=matched_imei_counts)
                # Output StatsD stats
                statsd.gauge(
                    '{0}matched_imeis.{1}'.format(metrics_run_root,
                                                  condition.label.lower()),
                    job_state['num_matched_imeis'])
    finally:
        # Always drop intermediate tables and release the advisory lock (if held)
        _do_final_cleanup(conn, logger, locked, intermediate_tables)

    # If we had an error condition, generate an error return code on exit
    if had_errored_condition:
        sys.exit(1)