Example #1
    def import_data(self):
        """Main import function that checks files and validates the contents before importing to the database."""
        assert self._was_entered, 'Attempting to import with an importer outside of a with statement!'
        try:
            # Store time so that we can track metrics for total import time
            st = time.time()

            # Store initial metadata
            metadata.add_optional_job_metadata(self._metadata_conn,
                                               'dirbs-import', self.import_id,
                                               **self._import_metadata)

            # Log initial message
            self._logger.info('Importing {0} data from file \'{1}\''.format(
                self._import_type, self._filename))

            # Init staging table (commit afterwards to ensure other processes can see table)
            with self._conn:
                self._time_component_perf('init_staging',
                                          self._init_staging_table)
                if self._supports_imei_shards:
                    self._time_component_perf('init_staging_shards',
                                              self._init_staging_table_shards)

            # Compute MD5 hash
            self._time_component_perf('compute_md5', self._compute_md5_hash)
            # Now do extract -> split -> preprocess -> prevalidate -> upload pipeline
            self._time_component_perf('upload_pipeline', self._upload_pipeline)
            # ANALYZE staging table after upload
            self._time_component_perf('analyze_staging',
                                      self._analyze_staging_table)
            # Run binary (yes/no) validation checks that operate on "raw" data (prior to post-processing)
            self._time_component_perf('validation_binary_checks_raw',
                                      self._validate_binary_checks_raw)
            # Post-process staging table
            self._time_component_perf('postprocess_staging',
                                      self._postprocess_staging_data)
            # Run binary (yes/no) validation checks
            self._time_component_perf('validation_binary_checks',
                                      self._validate_binary_checks)
            # Run row threshold validation checks
            self._time_component_perf('validation_threshold_checks',
                                      self._validate_threshold_checks)
            # Run validation checks based on historic data
            self._time_component_perf('validation_historical_checks',
                                      self._validate_historical_checks)
            # Copy data from the staging table
            rows_before = -1  # Sentinel value
            if self._need_previous_count_for_stats:
                rows_before = self.row_count
            rows_inserted, rows_updated, rows_deleted = \
                self._time_component_perf('copy_from_staging', self._copy_staging_data)
            # Output import stats
            self._time_component_perf('output_stats', self._output_stats,
                                      rows_before, rows_inserted, rows_updated,
                                      rows_deleted)

        finally:
            dt = int((time.time() - st) * 1000)
            self._log_normalized_import_time_metrics(dt)
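    # Each import step above runs through self._time_component_perf(name, fn, *args). A minimal
    # sketch of such a per-component timing helper is shown below; it is an illustrative
    # assumption rather than the actual DIRBS implementation, and the statsd attribute and
    # metric name are hypothetical.
    def _time_component_perf(self, component_name, fn, *args, **kwargs):
        """Run fn(*args, **kwargs), record its wall-clock duration and return its result."""
        component_start = time.time()
        try:
            return fn(*args, **kwargs)
        finally:
            duration_ms = int((time.time() - component_start) * 1000)
            self._logger.debug('Import component {0} took {1:d} ms'.format(component_name, duration_ms))
            # Hypothetical statsd handle and metric name, shown only to illustrate the pattern
            self._statsd.gauge('import.{0}.runtime'.format(component_name), duration_ms)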
Example #2
def process(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
            metrics_root, metrics_run_root):
    """Start whitelist processing job."""
    logger.info('Initiating Whitelist processing job...')

    operator_config = config.broker_config.operators
    kafka_config = config.broker_config.kafka
    h_consumer = create_kafka_consumer(logger, config)
    h_producer = create_kafka_producer(logger, config)

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       kafka={
                                           'host': kafka_config.hostname,
                                           'port': kafka_config.port,
                                           'topic': kafka_config.topic
                                       },
                                       operators=[{
                                           'operator': op.id,
                                           'topic': op.topic
                                       } for op in operator_config])

    whitelist_processing_job(consumer=h_consumer,
                             producer=h_producer,
                             operator_config=operator_config,
                             conn=conn,
                             logger=logger)
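# create_kafka_consumer/create_kafka_producer above are assumed to build broker clients from
# config.broker_config.kafka. A minimal sketch using the kafka-python package follows; the
# library choice and client options are assumptions for illustration, while the helper names
# and config attributes mirror the call site above.
from kafka import KafkaConsumer, KafkaProducer


def create_kafka_consumer(logger, config):
    """Return a Kafka consumer subscribed to the configured topic."""
    kafka_config = config.broker_config.kafka
    bootstrap = '{0}:{1}'.format(kafka_config.hostname, kafka_config.port)
    logger.info('Creating Kafka consumer for topic {0} on {1}'.format(kafka_config.topic, bootstrap))
    return KafkaConsumer(kafka_config.topic, bootstrap_servers=bootstrap)


def create_kafka_producer(logger, config):
    """Return a Kafka producer connected to the configured broker."""
    kafka_config = config.broker_config.kafka
    bootstrap = '{0}:{1}'.format(kafka_config.hostname, kafka_config.port)
    logger.info('Creating Kafka producer for broker {0}'.format(bootstrap))
    return KafkaProducer(bootstrap_servers=bootstrap)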
Example #3
    def _compute_md5_hash(self):
        """Method to compute the MD5 hash for the filename."""
        self._logger.info('Computing MD5 hash of the input file...')
        with open(self._filename, 'rb') as f:
            md5 = compute_md5_hash(f)
        self._logger.info('Computed MD5 hash of the input file')
        metadata.add_optional_job_metadata(self._metadata_conn,
                                           'dirbs-import',
                                           self.import_id,
                                           input_file_md5=md5)
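# compute_md5_hash(f) above is assumed to hash the already-open binary file in fixed-size
# chunks so large input files never need to fit in memory. A minimal sketch of such a helper
# (an illustrative assumption, not necessarily the DIRBS implementation):
import hashlib


def compute_md5_hash(f, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of a file object opened in binary mode."""
    md5 = hashlib.md5()
    for chunk in iter(lambda: f.read(chunk_size), b''):
        md5.update(chunk)
    return md5.hexdigest()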
Example #4
def cli(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
        metrics_root, metrics_run_root):
    """
    DIRBS script to catalog data files received by DIRBS Core.

    :param ctx: click commands context object
    :param config: dirbs config
    :param statsd: statsd instance
    :param logger: logger instance
    :param run_id: current run id of the job
    :param conn: database connection
    :param metadata_conn: database connection to store metadata
    :param command: job command
    :param metrics_root: statsd metrics root
    :param metrics_run_root: statsd run metrics root
    """
    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        prospectors=config.catalog_config.prospectors,
        perform_prevalidation=config.catalog_config.perform_prevalidation)

    harvested_files = _harvest_files(config.catalog_config.prospectors, logger)
    logger.info('Fetching files in the existing data catalog...')
    cataloged_files = _fetch_catalog_files(config)
    logger.info('Found {0} file(s) in the existing catalog'.format(
        len(cataloged_files)))

    uncataloged_files = [
        x for x in harvested_files
        if x['file_properties'] not in cataloged_files
    ]
    logger.info('Discovered {0} new or modified file(s)'.format(
        len(uncataloged_files)))

    if len(uncataloged_files) > 0:
        logger.info(
            'Determining catalog attributes for the discovered files...')
        uncataloged_files = _populate_file_properties(
            config, uncataloged_files, run_id,
            config.catalog_config.perform_prevalidation, logger)
        logger.info(
            'Finished determining catalog attributes for the discovered files')
        logger.info('Updating data catalog with new or modified files...')
        _update_catalog(uncataloged_files, config)
        logger.info('Finished updating data catalog')
    else:
        logger.info('Data catalog is already up-to-date!')
Example #5
def non_active_pairs(ctx: callable, config: callable, statsd: callable,
                     logger: callable, run_id: int, conn: callable,
                     metadata_conn: callable, command: str,
                     metrics_root: callable, metrics_run_root: callable,
                     output_dir: str, period: int) -> None:
    """Generate list of Non-Active pairs over specified period.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        period: period in days for a pair to be counted as not active (i.e. not active for this many days)
    Returns:
        None
    """
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))

    current_date = datetime.date.today()
    last_seen_date = datetime.date(
        current_date.year, current_date.month,
        current_date.day) - datetime.timedelta(period)
    logger.info(
        'List of Non-Active Pairs with last_seen less than {0} will be generated'
        .format(last_seen_date))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_non_active_pairs(conn, logger, report_dir,
                                                 last_seen_date)

    statsd.gauge(
        '{0}runtime.per_report.non_active_pairs'.format(metrics_run_root),
        cp.duration)
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
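# utils.CodeProfiler above is used as a context manager whose .duration attribute is read once
# the block exits and pushed to statsd. A minimal sketch of such a profiler follows; it is an
# illustrative assumption (including the millisecond unit), not the actual DIRBS utils class.
import time


class CodeProfiler:
    """Context manager recording the wall-clock duration of its block."""

    def __enter__(self):
        self._start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.duration = int((time.time() - self._start) * 1000)  # assumed unit: milliseconds
        return False  # never suppress exceptions raised inside the block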
Example #6
def stolen_violations(ctx: callable, config: callable, statsd: callable,
                      logger: callable, run_id: int, conn: callable,
                      metadata_conn: callable, command: str,
                      metrics_root: callable, metrics_run_root: callable,
                      output_dir: str, newer_than: str,
                      filter_by_conditions: list) -> None:
    """Generate per-MNO list of IMEIs seen on the network after they were reported stolen.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        newer_than: only include violations newer than this date
        filter_by_conditions: list of conditions to filter by
    Returns:
        None
    """
    operators_configured_check(config, logger)
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))

    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_stolen_violations(config, logger, report_dir,
                                                  conn, filter_by_conditions,
                                                  newer_than)

    statsd.gauge(
        '{0}runtime.per_report.blacklist_violations_stolen'.format(
            metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #7
def condition_imei_overlaps(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                            metrics_run_root, force_refresh, disable_retention_check, disable_data_check,
                            debug_query_performance, month, year, output_dir):
    """
    Generate per-condition reports showing matched IMEIs seen on more than one MNO network.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: statsd metrics root
    :param metrics_run_root: statsd run metrics root
    :param force_refresh: force refresh flag
    :param disable_retention_check: retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: debug query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn,
                               disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)
    report_metadata = []

    with utils.CodeProfiler() as cp:
        country_name = config.region_config.name
        logger.info('Generating country per-condition IMEI overlap reports (classified IMEIs seen on more than '
                    'one MNO\'s network this month)...')
        cond_names = [c.label for c in config.conditions]
        report_metadata.extend(_write_condition_imei_overlaps(conn, config, month, year, country_name,
                                                              report_dir, cond_names))
    statsd.gauge('{0}runtime.per_report.condition_imei_overlaps'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)
Example #8
def unregistered_subscribers(ctx: callable, config: callable, statsd: callable,
                             logger: callable, run_id: int, conn: callable,
                             metadata_conn: callable, command: str,
                             metrics_root: callable,
                             metrics_run_root: callable, output_dir: str,
                             newer_than: str):
    """Generate per-MNO list of IMSIs that are not registered in subscribers list.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        newer_than: only include violations newer than this date
    Returns:
        None
    """
    operators_configured_check(config, logger)
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_un_registered_subscribers(
            logger, config, report_dir, conn, newer_than)

    statsd.gauge(
        '{0}runtime.per_report.unregistered_subscribers'.format(
            metrics_run_root), cp.duration)

    # store metadata
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #9
def top_duplicates(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root,
                   force_refresh, disable_retention_check, disable_data_check, debug_query_performance,
                   month, year, output_dir):
    """
    Generate report listing IMEIs seen with more than 5 IMSIs in a given month and year.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: statsd metrics root
    :param metrics_run_root: statsd run metrics root
    :param force_refresh: force refresh flag
    :param disable_retention_check: retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: debug query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn,
                               disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_metadata = []
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)
    with utils.CodeProfiler() as cp:
        imsi_min_limit = 5
        country_name = config.region_config.name
        logger.info('Generating country duplicate IMEI report (IMEIs seen with more than {0:d} IMSIs this '
                    'reporting month)...'.format(imsi_min_limit))
        report_metadata.extend(_write_country_duplicates_report(conn, config, month, year, country_name,
                                                                report_dir, imsi_min_limit=imsi_min_limit))
    statsd.gauge('{0}runtime.per_report.top_duplicates'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)
Example #10
def gsma_not_found(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root,
                   force_refresh, disable_retention_check, disable_data_check, debug_query_performance,
                   month, year, output_dir):
    """
    Generate report of all GSMA not found IMEIs.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: statsd metrics root
    :param metrics_run_root: statsd run metrics root
    :param force_refresh: force refresh flag
    :param disable_retention_check: data retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn,
                               disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)

    report_metadata = []

    with utils.CodeProfiler() as cp:
        logger.info('Generating country GSMA not found report...')
        country_name = config.region_config.name
        report_metadata.extend(_write_country_gsma_not_found_report(conn, config, month,
                                                                    year, country_name, report_dir))
    statsd.gauge('{0}runtime.per_report.gsma_not_found'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)
Example #11
def blacklist_violations(ctx: callable, config: callable, statsd: callable,
                         logger: callable, run_id: int, conn: callable,
                         metadata_conn: callable, command: str,
                         metrics_root: callable, metrics_run_root: callable,
                         output_dir: str, month: int, year: int) -> None:
    """Generate per-operator blacklist violations.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        month: reporting month
        year: reporting year
    Returns:
        None
    """
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_blacklist_violations(logger, config,
                                                     report_dir, conn, month,
                                                     year)
    statsd.gauge(
        '{0}runtime.per_report.blacklist_violation'.format(metrics_run_root),
        cp.duration)
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #12
def classified_triplets(ctx: callable, config: callable, statsd: callable,
                        logger: callable, run_id: int, conn: callable,
                        metadata_conn: callable, command: str,
                        metrics_root: callable, metrics_run_root: callable,
                        output_dir: str, conditions: list) -> None:
    """Generate per-condition classified triplets list.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        conditions: list of conditions for classified triplets
    Returns:
        None
    """
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_classified_triplets(logger, conditions,
                                                    report_dir, conn)

    statsd.gauge(
        '{0}runtime.per_report.classified_triplets'.format(metrics_run_root),
        cp.duration)
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #13
def transient_msisdns(ctx: callable, config: callable, statsd: callable,
                      logger: callable, run_id: int, conn: callable,
                      metadata_conn: callable, command: str,
                      metrics_root: callable, metrics_run_root: callable,
                      output_dir: str, period: int, num_of_imeis: int,
                      current_date: str) -> None:
    """Generate list of MSISDNS used with possible transient IMEIs.

    Required Arguments:
        period: Analysis period in days (positive integer)
        num_of_imeis: Number of IMEIs a MSISDN must be seen with for analysis
    """
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_transient_msisdns(logger,
                                                  period,
                                                  report_dir,
                                                  conn,
                                                  config,
                                                  num_of_imeis,
                                                  current_date=current_date)

    statsd.gauge(
        '{0}runtime.per_report.transient_msisdns'.format(metrics_run_root),
        cp.duration)
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #14
def lists(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
          metrics_root, metrics_run_root):
    """Prune obsolete lists data."""
    curr_date = ctx.obj['CURR_DATE']

    # store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        retention_months=config.retention_config.months_retention)

    logger.info(
        'Pruning lists tables to remove any obsolete data with end_time outside the retention window..'
    )
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()

    first_month_to_drop = datetime.date(
        curr_date.year, curr_date.month,
        1) - relativedelta.relativedelta(months=retention_months)
    logger.info(
        'Lists data with end_time earlier than {0} will be pruned'.format(
            first_month_to_drop))

    with utils.db_role_setter(
            conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug('Calculating original number of rows in lists tables...')
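        # Each COUNT(*) subquery below is aliased as a derived table and then selected by that
        # alias, so each value comes back as a single-field composite rendered like '(42)'; the
        # .strip('()') calls further down unwrap those composites back into integers.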
        row_count_sql = sql.SQL(
            """SELECT blacklist_row_count, noft_lists_row_count, excp_lists_row_count
                                     FROM (SELECT COUNT(*)
                                             FROM blacklist) AS blacklist_row_count,
                                          (SELECT COUNT(*)
                                             FROM notifications_lists) AS noft_lists_row_count,
                                          (SELECT COUNT(*)
                                             FROM exceptions_lists) AS excp_lists_row_count"""
        )
        cursor.execute(row_count_sql)
        rows_before = cursor.fetchone()
        blacklist_rows_before = int(
            rows_before.blacklist_row_count.strip('()'))
        notflist_rows_before = int(
            rows_before.noft_lists_row_count.strip('()'))
        excplist_rows_before = int(
            rows_before.excp_lists_row_count.strip('()'))
        rows_before = blacklist_rows_before + notflist_rows_before + excplist_rows_before
        logger.debug('Calculated original number of rows in lists tables...')
        statsd.gauge('{0}blacklist_rows_before'.format(metrics_run_root),
                     blacklist_rows_before)
        statsd.gauge(
            '{0}notifications_lists_rows_before'.format(metrics_run_root),
            notflist_rows_before)
        statsd.gauge(
            '{0}exceptions_lists_rows_before'.format(metrics_run_root),
            excplist_rows_before)
        metadata.add_optional_job_metadata(
            metadata_conn,
            command,
            run_id,
            blacklist_rows_before=blacklist_rows_before,
            notifications_lists_rows_before=notflist_rows_before,
            exceptions_lists_rows_before=excplist_rows_before)

        # Calculate number of rows in the lists table outside the retention window
        job_metadata_filter_sql = """SELECT run_id
                                       FROM job_metadata
                                      WHERE command = 'dirbs-listgen'
                                        AND end_time < '{0}'""".format(
            first_month_to_drop)

        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM blacklist
                                   WHERE start_run_id IN ({0})""".format(
                job_metadata_filter_sql)))
        total_bl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of blacklist table outside the retention window to prune'
            .format(total_bl_rows_out_window_to_prune))

        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM notifications_lists
                                   WHERE start_run_id IN ({0})""".format(
                job_metadata_filter_sql)))
        total_nl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of notifications lists table outside the retention window to prune'
            .format(total_nl_rows_out_window_to_prune))

        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM exceptions_lists
                                   WHERE start_run_id IN ({0})""".format(
                job_metadata_filter_sql)))
        total_el_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of exceptions lists table outside the retention window to prune'
            .format(total_el_rows_out_window_to_prune))

        # We repartition the tables to re-create them, passing a condition sql
        logger.debug('Re-creating blacklist table...')
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify(
            """WHERE start_run_id NOT IN ({0})""".format(
                job_metadata_filter_sql))
        partition_utils.repartition_blacklist(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created blacklist table')

        logger.debug('Re-creating notifications lists table...')
        partition_utils.repartition_notifications_lists(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created notifications lists table')

        logger.debug('Re-creating exceptions lists table...')
        partition_utils.repartition_exceptions_lists(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created exceptions lists table')

        logger.debug('Calculating new number of rows in lists tables...')
        cursor.execute(row_count_sql)
        rows_after = cursor.fetchone()
        blacklist_rows_after = int(rows_after.blacklist_row_count.strip('()'))
        notflist_rows_after = int(rows_after.noft_lists_row_count.strip('()'))
        excplist_rows_after = int(rows_after.excp_lists_row_count.strip('()'))
        rows_after = blacklist_rows_after + notflist_rows_after + excplist_rows_after
        logger.debug('Calculated new number of rows in lists tables')
        statsd.gauge('{0}blacklist_rows_after'.format(metrics_run_root),
                     blacklist_rows_after)
        statsd.gauge(
            '{0}notifications_lists_rows_after'.format(metrics_run_root),
            notflist_rows_after)
        statsd.gauge('{0}exceptions_lists_rows_after'.format(metrics_run_root),
                     excplist_rows_after)
        metadata.add_optional_job_metadata(
            metadata_conn,
            command,
            run_id,
            blacklist_rows_after=blacklist_rows_after,
            notifications_lists_rows_after=notflist_rows_after,
            exceptions_lists_rows_after=excplist_rows_after)
        logger.info('Pruned {0:d} rows from lists tables'.format(rows_before -
                                                                 rows_after))
Example #15
def classification_state(ctx, config, statsd, logger, run_id, conn,
                         metadata_conn, command, metrics_root,
                         metrics_run_root):
    """Prune obsolete classification_state data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        retention_months=config.retention_config.months_retention)

    logger.info(
        'Pruning classification_state table to remove any classification state data related to '
        'obsolete conditions and data with end_date outside the retention window..'
    )

    cond_config_list = [c.label for c in config.conditions]
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()

    first_month_to_drop = datetime.date(
        curr_date.year, curr_date.month,
        1) - relativedelta.relativedelta(months=retention_months)
    logger.info(
        'Classification state data with end_date earlier than {0} will be '
        'pruned'.format(first_month_to_drop))

    with utils.db_role_setter(
            conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug(
            'Calculating original number of rows in classification_state table...'
        )
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_before = cursor.fetchone()[0]
        logger.debug(
            'Calculated original number of rows in classification_state table')
        statsd.gauge('{0}rows_before'.format(metrics_run_root), rows_before)
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_before=rows_before)

        # Calculate number of rows in the classification table outside retention window
        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE end_date < %s """),
            [first_month_to_drop])
        total_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of classification_state table '
            'with end_date outside the retention window to prune.'.format(
                total_rows_out_window_to_prune))

        # Calculate number of rows in the classification with conditions no longer existing
        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE NOT starts_with_prefix(cond_name, %s)"""
                    ), [cond_config_list])
        total_rows_no_cond_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of classification_state table with conditions '
            'no longer existing to prune.'.format(total_rows_no_cond_to_prune))

        logger.debug('Re-creating classification_state table...')
        # Basically, we just re-partition the classification_state table to re-create it, passing a src_filter_sql
        # parameter
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify(
            """WHERE (end_date > %s
                                              OR end_date IS NULL)
                                             AND cond_name LIKE ANY(%s)""",
            [first_month_to_drop, cond_config_list])
        partition_utils.repartition_classification_state(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created classification_state table')

        logger.debug(
            'Calculating new number of rows in classification_state table...')
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_after = cursor.fetchone()[0]
        logger.debug(
            'Calculated new number of rows in classification_state table')
        statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_after=rows_after)

        logger.info('Pruned {0:d} rows from classification_state table'.format(
            rows_before - rows_after))
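# cursor.mogrify above returns the interpolated SQL fragment as bytes, which is why it is
# decoded with str(..., encoding=conn.encoding) before being handed to the repartition helper
# as src_filter_sql. A self-contained sketch of that pattern (the helper name is hypothetical;
# the psycopg2 mogrify/decode calls mirror the code above):
def build_src_filter_sql(conn, first_month_to_drop, cond_labels):
    """Return a WHERE fragment with values safely interpolated by the database driver."""
    with conn.cursor() as cursor:
        sql_bytes = cursor.mogrify(
            """WHERE (end_date > %s OR end_date IS NULL)
                 AND cond_name LIKE ANY(%s)""",
            [first_month_to_drop, cond_labels])
    return str(sql_bytes, encoding=conn.encoding)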
Example #16
def blacklist(ctx, config, statsd, logger, run_id, conn, metadata_conn,
              command, metrics_root, metrics_run_root, condition_name,
              prune_all):
    """Expire IMEIs outside the blacklist retention period from blacklist."""
    current_date = datetime.date.today()
    retention_days = config.retention_config.blacklist_retention

    if condition_name is None and prune_all is False:
        logger.info(
            'Error: one of the arguments "condition_name" or "--prune-all" is required'
        )
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    elif condition_name is not None and prune_all is True:
        logger.info(
            'Error: only one of the arguments "condition_name" or "--prune-all" may be specified'
        )
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    elif retention_days == 0:
        logger.info(
            'Blacklist will not be pruned, as retention value is set to {0}'.
            format(retention_days))
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    else:
        _warn_about_prune_all(prune_all, logger)
        logger.info(
            'Pruning blacklist to remove any data related to specified condition '
            'outside the retention window.')
        last_retention_date = datetime.date(
            current_date.year, current_date.month,
            current_date.day) - datetime.timedelta(retention_days)

        # store metadata
        logger.info(
            'Blacklist entries with start_date earlier than {0} will be pruned'
            .format(last_retention_date))
        metadata.add_optional_job_metadata(
            metadata_conn,
            command,
            run_id,
            curr_date=current_date.isoformat(),
            retention_days=retention_days,
            job_executed=True,
            last_retention_date=last_retention_date.isoformat())

        with utils.db_role_setter(
                conn,
                role_name='dirbs_core_power_user'), conn.cursor() as cursor:
            logger.debug(
                'Calculating original number of rows with block_date in classification_state table...'
            )

            cursor.execute("""SELECT COUNT(*)
                                FROM classification_state
                               WHERE block_date IS NOT NULL
                                 AND end_date IS NULL""")
            rows_before = cursor.fetchone()[0]

            logger.debug(
                'Calculated original number of rows (having block_date) in classification_state table'
            )
            statsd.gauge('{0}rows_before'.format(metrics_run_root),
                         rows_before)
            metadata.add_optional_job_metadata(metadata_conn,
                                               command,
                                               run_id,
                                               rows_before=rows_before)

            # if it's condition-based pruning
            if not prune_all:
                cursor.execute(
                    sql.SQL("""SELECT COUNT(*)
                                            FROM classification_state
                                           WHERE start_date < %s
                                             AND cond_name = %s
                                             AND end_date IS NULL
                                             AND block_date IS NOT NULL"""),
                    [last_retention_date, condition_name[0].label])
                total_rows_to_prune = cursor.fetchone()[0]

                logger.info(
                    'Found {0:d} rows of classification_state table '
                    'with start_date for {1} dimension outside the blacklist '
                    'retention window.'.format(total_rows_to_prune,
                                               condition_name[0].label))

                if total_rows_to_prune > 0:
                    cursor.execute(
                        sql.SQL("""UPDATE classification_state
                                                 SET end_date = '{0}'
                                               WHERE start_date < '{1}'
                                                 AND cond_name = '{2}'
                                                 AND end_date IS NULL
                                                 AND block_date IS NOT NULL""".
                                format(current_date.isoformat(),
                                       last_retention_date,
                                       condition_name[0].label)))

                logger.info(
                    'Pruned {0:d} rows from blacklist for {1} dimension'.
                    format(total_rows_to_prune, condition_name[0].label))

            # prune without any condition
            else:
                cursor.execute(
                    sql.SQL("""SELECT COUNT(*)
                                            FROM classification_state
                                           WHERE start_date < %s
                                             AND end_date IS NULL
                                             AND block_date IS NOT NULL"""),
                    [last_retention_date])
                total_rows_to_prune = cursor.fetchone()[0]

                logger.info(
                    'Found {0:d} rows of classification_state table '
                    'with start_date outside the blacklist retention window.'.
                    format(total_rows_to_prune))

                if total_rows_to_prune > 0:
                    cursor.execute(
                        sql.SQL("""UPDATE classification_state
                                                 SET end_date = '{0}'
                                               WHERE start_date < '{1}'
                                                 AND end_date IS NULL
                                                 AND block_date IS NOT NULL""".
                                format(current_date.isoformat(),
                                       last_retention_date)))
                logger.info('Pruned {0:d} rows from blacklist'.format(
                    total_rows_to_prune))

            logger.debug(
                'Calculating remaining number of rows with block_date (end_date is null) '
                'in classification_state table...')
            cursor.execute("""SELECT COUNT(*)
                                FROM classification_state
                               WHERE block_date IS NOT NULL
                                 AND end_date IS NULL""")
            rows_after = cursor.fetchone()[0]

            logger.debug(
                'Calculated remaining number of rows (having block_date and end_date null) '
                'in classification_state table')
            statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
            metadata.add_optional_job_metadata(metadata_conn,
                                               command,
                                               run_id,
                                               rows_after=rows_after)
Example #17
def top_duplicates(ctx: callable, config: callable, statsd: callable,
                   logger: callable, run_id: int, conn: callable,
                   metadata_conn: callable, command: str,
                   metrics_root: callable, metrics_run_root: callable,
                   force_refresh: bool, disable_retention_check: bool,
                   disable_data_check: bool, debug_query_performance: bool,
                   month: int, year: int, output_dir: str) -> None:
    """Generate report listing IMEIs seen with more than 5 IMSIs in a given month and year.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        force_refresh: bool to force writing/generating reports from scratch
        disable_retention_check: bool to disable data retention check
        disable_data_check: bool to disable data check
        debug_query_performance: bool to debug query performance
        month: reporting month
        year: reporting year
        output_dir: output directory path
    Returns:
        None
    """
    reports_validation_checks(disable_retention_check, year, month, logger,
                              config, conn, disable_data_check)
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        refreshed_data=force_refresh,
        month=month,
        year=year,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))
    report_metadata = []
    report_dir = make_report_directory(ctx,
                                       output_dir,
                                       run_id,
                                       conn,
                                       config,
                                       year=year,
                                       month=month)
    with utils.CodeProfiler() as cp:
        imsi_min_limit = 5
        country_name = config.region_config.name
        logger.info(
            'Generating country duplicate IMEI report (IMEIs seen with more than {0:d} IMSIs this '
            'reporting month)...'.format(imsi_min_limit))
        report_metadata.extend(
            write_country_duplicates_report(conn,
                                            config,
                                            month,
                                            year,
                                            country_name,
                                            report_dir,
                                            imsi_min_limit=imsi_min_limit))
    statsd.gauge(
        '{0}runtime.per_report.top_duplicates'.format(metrics_run_root),
        cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #18
def condition_imei_overlaps(ctx: callable, config: callable, statsd: callable,
                            logger: callable, run_id: int, conn: callable,
                            metadata_conn: callable, command: str,
                            metrics_root: callable, metrics_run_root: callable,
                            force_refresh: bool, disable_retention_check: bool,
                            disable_data_check: bool,
                            debug_query_performance: bool, month: int,
                            year: int, output_dir: str):
    """Generate per-condition reports showing matched IMEIs seen on more than one MNO network.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        force_refresh: bool to force writing/generating reports from scratch
        disable_retention_check: bool to disable data retention check
        disable_data_check: bool to disable data check
        debug_query_performance: bool to debug query performance
        month: reporting month
        year: reporting year
        output_dir: output directory path
    Returns:
        None
    """
    reports_validation_checks(disable_retention_check, year, month, logger,
                              config, conn, disable_data_check)
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        refreshed_data=force_refresh,
        month=month,
        year=year,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx,
                                       output_dir,
                                       run_id,
                                       conn,
                                       config,
                                       year=year,
                                       month=month)
    report_metadata = []

    with utils.CodeProfiler() as cp:
        country_name = config.region_config.name
        logger.info(
            'Generating country per-condition IMEI overlap reports (classified IMEIs seen on more than '
            "one MNO's network this month)...")
        cond_names = [c.label for c in config.conditions]
        report_metadata.extend(
            write_condition_imei_overlaps(conn, config, month, year,
                                          country_name, report_dir,
                                          cond_names))
    statsd.gauge(
        '{0}runtime.per_report.condition_imei_overlaps'.format(
            metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
Example #19
def stolen_violations(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                      metrics_run_root, output_dir, newer_than, filter_by_conditions):
    """
    Generate per-MNO list of IMEIs seen on the network after they were reported stolen.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: metadata database connection
    :param command: command name
    :param metrics_root: statsd metrics root
    :param metrics_run_root: statsd run metrics root
    :param output_dir: output directory path
    :param newer_than: only report violations newer than this date
    :param filter_by_conditions: list of conditions to filter violations by
    """
    _operators_configured_check(config, logger)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))

    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        logger.info('Generating per-MNO stolen list violations reports...')
        with contextlib.ExitStack() as stack:
            # Push files into exit stack so that they will all be closed.
            operator_ids = [o.id for o in config.region_config.operators]
            filename_op_map = {'stolen_violations_{0}.csv'.format(o): o for o in operator_ids}
            opname_file_map = {o: stack.enter_context(open(os.path.join(report_dir, fn), 'w', encoding='utf8'))
                               for fn, o in filename_op_map.items()}
            # Create a map from operator name to csv writer
            opname_csvwriter_map = {o: csv.writer(opname_file_map[o]) for o in operator_ids}
            # Write the header to each csvwriter
            for _, writer in opname_csvwriter_map.items():
                writer.writerow(['imei_norm', 'last_seen', 'reporting_date'])

            # Run a query to find all the classified IMEIs seen on multiple operators
            blacklist_violations_grace_period_days = config.report_config.blacklist_violations_grace_period_days
            with conn.cursor() as cursor:
                query = sql.SQL("""SELECT imei_norm, last_seen, reporting_date, operator_id
                                     FROM (SELECT imei_norm, MIN(reporting_date) AS reporting_date
                                             FROM stolen_list
                                         GROUP BY imei_norm) AS stolen_imeis
                                     JOIN LATERAL (
                                           SELECT imei_norm, operator_id, MAX(last_seen) AS last_seen
                                             FROM monthly_network_triplets_per_mno_no_null_imeis nt
                                            WHERE imei_norm = stolen_imeis.imei_norm
                                              AND virt_imei_shard = calc_virt_imei_shard(stolen_imeis.imei_norm)
                                         GROUP BY imei_norm, operator_id) network_imeis
                                    USING (imei_norm)
                                    WHERE network_imeis.last_seen > stolen_imeis.reporting_date + %s
                                          {0}
                                          {1}""")

                if filter_by_conditions:
                    cond_filter_query = """AND EXISTS(SELECT 1
                                                        FROM classification_state
                                                       WHERE imei_norm = stolen_imeis.imei_norm
                                                         AND virt_imei_shard =
                                                                calc_virt_imei_shard(stolen_imeis.imei_norm)
                                                         AND cond_name IN %s
                                                         AND end_date IS NULL)"""
                    sql_bytes = cursor.mogrify(cond_filter_query, [tuple([c.label for c in filter_by_conditions])])
                    conditions_filter_sql = sql.SQL(str(sql_bytes, conn.encoding))
                else:
                    conditions_filter_sql = sql.SQL('')

                if newer_than:
                    newer_than_query = 'AND last_seen > %s'
                    sql_bytes = cursor.mogrify(newer_than_query, [newer_than])
                    date_filter_sql = sql.SQL(str(sql_bytes, conn.encoding))
                else:
                    date_filter_sql = sql.SQL('')

                cursor.execute(query.format(conditions_filter_sql, date_filter_sql),
                               [blacklist_violations_grace_period_days])
                for res in cursor:
                    opname_csvwriter_map[res.operator_id].writerow([res.imei_norm, res.last_seen.strftime('%Y%m%d'),
                                                                    res.reporting_date.strftime('%Y%m%d')])

        report_metadata = _gen_metadata_for_reports(list(filename_op_map.keys()), report_dir)

    statsd.gauge('{0}runtime.per_report.blacklist_violations_stolen'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)
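# The two optional filters above are pre-bound with cursor.mogrify() and then
# spliced into the composed query via psycopg2.sql. A minimal, self-contained
# sketch of that pattern follows; it is illustrative only and not part of
# DIRBS -- the 'events' table and fetch_recent_rows() helper are hypothetical.
from psycopg2 import sql


def fetch_recent_rows(conn, newer_than=None):
    """Fetch rows from a hypothetical 'events' table, optionally filtered by date."""
    with conn.cursor() as cursor:
        base_query = sql.SQL('SELECT id, seen_date FROM events WHERE TRUE {0}')
        if newer_than is not None:
            # mogrify binds the parameter client-side and returns bytes, which are
            # decoded and wrapped so they can be composed into the final query
            filter_bytes = cursor.mogrify('AND seen_date > %s', [newer_than])
            date_filter_sql = sql.SQL(str(filter_bytes, conn.encoding))
        else:
            date_filter_sql = sql.SQL('')
        cursor.execute(base_query.format(date_filter_sql))
        return cursor.fetchall()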
Example #20
0
def triplets(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
             metrics_root, metrics_run_root):
    """Prune old monthly_network_triplets data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        retention_months=config.retention_config.months_retention)

    if curr_date is None:
        curr_date = datetime.date.today()

    with conn.cursor() as cursor:
        logger.info(
            'Pruning monthly_network_triplets data outside the retention window from database...'
        )
        retention_months = config.retention_config.months_retention
        first_month_to_drop = datetime.date(
            curr_date.year, curr_date.month,
            1) - relativedelta.relativedelta(months=retention_months)
        logger.info(
            'monthly_network_triplets partitions older than {0} will be pruned'
            .format(first_month_to_drop))

        country_monthly_partitions = utils.child_table_names(
            conn, 'monthly_network_triplets_country')
        operator_partitions = utils.child_table_names(
            conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = []
        for op_partition in operator_partitions:
            operator_monthly_partitions.extend(
                utils.child_table_names(conn, op_partition))

        parent_tbl_names = [
            'monthly_network_triplets_country',
            'monthly_network_triplets_per_mno'
        ]
        rows_before = {}
        for tbl in parent_tbl_names:
            logger.debug(
                'Calculating original number of rows in {0} table...'.format(
                    tbl))
            cursor.execute(sql.SQL('SELECT COUNT(*) FROM {0}').format(sql.Identifier(tbl)))
            rows_before[tbl] = cursor.fetchone()[0]
            logger.debug(
                'Calculated original number of rows in {0} table'.format(tbl))
            statsd.gauge('{0}.{1}.rows_before'.format(metrics_run_root, tbl),
                         rows_before[tbl])
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_before=rows_before)

        total_rows_pruned = 0
        total_partitions = country_monthly_partitions + operator_monthly_partitions
        for tblname in total_partitions:
            invariants_list = utils.table_invariants_list(
                conn, [tblname], ['triplet_month', 'triplet_year'])
            assert len(invariants_list) <= 1
            if len(invariants_list) == 0:
                logger.warning(
                    'Found empty partition {0}. Dropping...'.format(tblname))
                cursor.execute(
                    sql.SQL("""DROP TABLE {0} CASCADE""").format(
                        sql.Identifier(tblname)))
            else:
                month, year = tuple(invariants_list[0])

                # Check if table year/month is outside the retention window
                if (datetime.date(year, month, 1) < first_month_to_drop):
                    # Calculate number of rows in the partition table
                    cursor.execute(
                        sql.SQL("""SELECT COUNT(*) FROM {0}""").format(
                            sql.Identifier(tblname)))
                    partition_table_rows = cursor.fetchone()[0]
                    total_rows_pruned += partition_table_rows

                    logger.info('Dropping table {0} with {1} rows...'.format(
                        tblname, partition_table_rows))
                    cursor.execute(
                        sql.SQL("""DROP TABLE {0} CASCADE""").format(
                            sql.Identifier(tblname)))
                    logger.info('Dropped table {0}'.format(tblname))

        rows_after = {}
        for tbl in parent_tbl_names:
            logger.debug(
                'Calculating new number of rows in {0} table...'.format(tbl))
            cursor.execute(sql.SQL('SELECT COUNT(*) FROM {0}').format(sql.Identifier(tbl)))
            rows_after[tbl] = cursor.fetchone()[0]
            logger.debug(
                'Calculated new number of rows in {0} table'.format(tbl))
            statsd.gauge('{0}.{1}.rows_after'.format(metrics_run_root, tbl),
                         rows_after[tbl])
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_after=rows_after)

        total_rows_before = sum(rows_before.values())
        total_rows_after = sum(rows_after.values())

        assert (total_rows_before - total_rows_after) == total_rows_pruned
        logger.info(
            'Pruned {0:d} rows of monthly_network_triplets data outside the retention window from database'
            .format(total_rows_pruned))
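# A minimal sketch of the retention-window arithmetic used above: the first day
# of the current month minus the configured number of retention months gives the
# cutoff, and partitions for months strictly before it are pruned. Illustrative
# only; first_month_outside_retention() is a hypothetical helper, not part of DIRBS.
import datetime

from dateutil import relativedelta


def first_month_outside_retention(curr_date, retention_months):
    """Return the cutoff date; partitions for months before this date fall outside the retention window."""
    month_start = datetime.date(curr_date.year, curr_date.month, 1)
    return month_start - relativedelta.relativedelta(months=retention_months)


# Example: with a current date of 2020-07-15 and 6 months of retention,
# partitions for months before 2020-01-01 would be dropped.
assert first_month_outside_retention(datetime.date(2020, 7, 15), 6) == datetime.date(2020, 1, 1)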
Example #21
0
def standard(ctx: callable, config: callable, statsd: callable,
             logger: callable, run_id: int, conn: callable,
             metadata_conn: callable, command: str, metrics_root: callable,
             metrics_run_root: callable, force_refresh: bool,
             disable_retention_check: bool, disable_data_check: bool,
             debug_query_performance: bool, month: int, year: int,
             output_dir: str) -> None:
    """Generate standard monthly operator and country-level reports.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        force_refresh: bool to force writing/generating reports from scratch
        disable_retention_check: bool to disable data retention check
        disable_data_check: bool to disable data check
        debug_query_performance: bool to debug query performance
        month: reporting month
        year: reporting year
        output_dir: output directory path
    Returns:
        None
    """
    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        refreshed_data=force_refresh,
        month=month,
        year=year,
        report_schema_version=report_schema_version,
        output_dir=os.path.abspath(str(output_dir)))

    reports_validation_checks(disable_retention_check, year, month, logger,
                              config, conn, disable_data_check)

    # Next, generate all the report data so that report generation can happen very quickly
    data_id, class_run_id, per_tac_compliance_data = generate_monthly_report_stats(
        config, conn, month, year, statsd, metrics_run_root, run_id,
        force_refresh, debug_query_performance)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       data_id=data_id,
                                       classification_run_id=class_run_id)

    report_dir = make_report_directory(ctx,
                                       output_dir,
                                       run_id,
                                       conn,
                                       config,
                                       class_run_id=class_run_id,
                                       year=year,
                                       month=month,
                                       data_id=data_id)

    # First, copy all the report JS/CSS files into the output directory in
    # cachebusted form and get the cachebusted filenames
    asset_map = {}
    report_assets = ['js/report.js', 'css/report.css']

    for fn in report_assets:
        logger.info('Copying required asset "%s" to report folder', fn)
        asset = pkgutil.get_data('dirbs', fn)
        name, ext = fn.split('/')[-1].split('.')
        filename = '{0}_{1}.{2}'.format(
            name, utils.cachebusted_filename_from_contents(asset), ext)
        asset_map[fn] = filename
        with open(os.path.join(report_dir, filename), 'wb') as of:
            of.write(asset)

    js_filename = asset_map['js/report.js']
    css_filename = asset_map['css/report.css']

    # Next, generate the country level report
    report_metadata = []
    with utils.CodeProfiler() as cp:
        logger.info('Generating country report...')
        country_name = config.region_config.name
        country_per_tac_compliance_data = None
        if per_tac_compliance_data is not None:
            country_per_tac_compliance_data = per_tac_compliance_data[
                OperatorConfig.COUNTRY_OPERATOR_NAME]
        report = CountryReport(
            conn,
            data_id,
            config,
            month,
            year,
            country_name,
            has_compliance_data=country_per_tac_compliance_data is not None)
        report_metadata.extend(
            write_report(report, month, year, report_dir, country_name,
                         css_filename, js_filename,
                         country_per_tac_compliance_data))

    statsd.gauge('{0}runtime.per_report.country'.format(metrics_run_root),
                 cp.duration)
    operators = config.region_config.operators
    # Finally, generate the operator reports
    for op in operators:
        with utils.CodeProfiler() as cp:
            logger.info('Generating operator report for operator ID %s...',
                        op.id)
            operator_per_tac_compliance_data = None
            if per_tac_compliance_data is not None:
                operator_per_tac_compliance_data = per_tac_compliance_data.get(
                    op.id)
            report = OperatorReport(
                conn,
                data_id,
                config,
                month,
                year,
                op,
                has_compliance_data=operator_per_tac_compliance_data
                is not None)
            report_prefix = '{0}_{1}'.format(country_name, op.id)
            report_metadata.extend(
                write_report(report, month, year, report_dir, report_prefix,
                             css_filename, js_filename,
                             operator_per_tac_compliance_data))
        statsd.gauge(
            '{0}runtime.per_report.operators.{1}'.format(
                metrics_run_root, op.id), cp.duration)

    # Store per-report job metadata
    metadata.add_optional_job_metadata(metadata_conn,
                                       command,
                                       run_id,
                                       report_outputs=report_metadata)
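# utils.cachebusted_filename_from_contents() is used above to embed a
# content-derived fragment in each asset filename so browsers re-fetch the asset
# whenever it changes. A sketch of how such a helper could be written follows;
# this is an assumed implementation, not DIRBS's actual one.
import hashlib


def cachebusted_fragment(contents: bytes, length: int = 8) -> str:
    """Return a short hex digest of the asset contents for use in a filename."""
    return hashlib.sha256(contents).hexdigest()[:length]


# Example: 'report.js' would be written out as 'report_<digest>.js', so any
# change to its bytes yields a new filename.
example_filename = 'report_{0}.js'.format(cachebusted_fragment(b'console.log("report");'))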
Example #22
0
def cli(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
        metrics_root, metrics_run_root, conditions, safety_check, curr_date,
        disable_sanity_checks):
    """
    DIRBS script to classify IMEIs.

    Iterates through all configured conditions and writes results to the classification_state table.

    :param ctx: click command context
    :param config: dirbs config instance
    :param statsd: statsd instance
    :param logger: dirbs logger instance
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database connection for job metadata
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param conditions: list of user supplied conditions
    :param safety_check: bool (enable/disable safety check)
    :param curr_date: date to use for classification
    :param disable_sanity_checks: bool (enable/disable sanity checks)
    """
    _warn_about_curr_date(curr_date, logger)
    _warn_about_disabled_safety_check(safety_check, logger)

    # If we didn't specify a condition, use all configured conditions
    if conditions is None:
        conditions = config.conditions

    # Query the job metadata table for all successful classification runs
    successful_job_runs = metadata.query_for_command_runs(metadata_conn,
                                                          'dirbs-classify',
                                                          successful_only=True)
    if successful_job_runs and not disable_sanity_checks and not _perform_sanity_checks(
            config, successful_job_runs[0].extra_metadata):
        raise ClassifySanityCheckFailedException(
            'Sanity checks failed, configurations are not identical to the last successful classification'
        )

    logger.info('Classifying using conditions: {0}'.format(','.join(
        [c.label for c in conditions])))

    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        conditions=[c.as_dict() for c in conditions],
        operators=[op.as_dict() for op in config.region_config.operators],
        amnesty=config.amnesty_config.as_dict())

    # Per-condition intermediate tables
    intermediate_tables = []

    # Flag indicating whether any condition failed, so that we can change the exit code
    had_errored_condition = False

    try:
        locked = False
        with conn, conn.cursor() as cursor:
            # Lock to prevent multiple simultaneous classifications
            cursor.execute('SELECT pg_try_advisory_lock(%s::BIGINT)',
                           [hash_string_64bit('dirbs-classify')])
            locked = cursor.fetchone()[0]
            if not locked:
                raise ClassifyLockException(
                    'Could not acquire lock for classification. '
                    'Are there any other dirbs-classify instances running at the moment?'
                )

            # Calculate total IMEI count
            if safety_check:
                logger.info(
                    'Counting number of IMEIs in network_imeis for safety check...'
                )
                cursor.execute('SELECT COUNT(*) FROM network_imeis')
                total_imei_count = cursor.fetchone()[0]
                logger.info(
                    'Finished counting number of IMEIs in network_imeis for safety check'
                )
            else:
                total_imei_count = -1

        matched_imei_counts = {}
        nworkers = config.multiprocessing_config.max_db_connections
        condition_objs = [Condition(cond_config) for cond_config in conditions]

        with futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
            logger.info(
                'Simultaneously classifying {0:d} dimensions using up to {1:d} workers...'
                .format(len(conditions), nworkers))

            calc_futures_to_condition = {}
            update_futures_to_condition = {}
            per_condition_state = defaultdict(
                lambda: dict(num_completed_calc_jobs=0,
                             num_total_calc_jobs=0,
                             num_completed_update_jobs=0,
                             num_total_update_jobs=0,
                             num_matched_imeis=0))
            for c in condition_objs:
                # Make sure we record all temporary tables so that we can cleanup later
                intermediate_tables.append(c.intermediate_tbl_name(run_id))
                # Queue the condition calculations and keep track
                for f in c.queue_calc_imeis_jobs(executor, config, run_id,
                                                 curr_date):
                    calc_futures_to_condition[f] = c
                    per_condition_state[c.label]['num_total_calc_jobs'] += 1

            # Process calculation futures
            for condition, job_state in _completed_calc_jobs(
                    calc_futures_to_condition, per_condition_state, logger):
                max_ratio = condition.config.max_allowed_matching_ratio
                num_matched_imeis = job_state['num_matched_imeis']
                max_matched_imeis = max_ratio * total_imei_count
                if safety_check and total_imei_count > 0 and num_matched_imeis > max_matched_imeis:
                    ratio = min(num_matched_imeis / total_imei_count, 1)
                    logger.error(
                        'Refusing to classify using condition \'{0}\': '
                        'This condition matches more than the maximum number of IMEIs allowed by the '
                        'condition\'s configuration '
                        '(matched_imeis={1:d}, ratio={2:f}, max_ratio={3:f})'.
                        format(condition.label, num_matched_imeis, ratio,
                               max_ratio))
                    had_errored_condition = True
                else:
                    # Queue the classification state updates and keep track
                    for f in condition.queue_update_classification_state_jobs(
                            executor, config, run_id, curr_date):
                        update_futures_to_condition[f] = condition
                        per_condition_state[
                            condition.label]['num_total_update_jobs'] += 1

            # Process update futures
            for condition, job_state in _completed_update_jobs(
                    update_futures_to_condition, per_condition_state, logger):
                # Update metadata about matched IMEI counts every time each condition finishes
                matched_imei_counts[
                    condition.label] = job_state['num_matched_imeis']
                metadata.add_optional_job_metadata(
                    metadata_conn,
                    command,
                    run_id,
                    matched_imei_counts=matched_imei_counts)
                # Output StatsD stats
                statsd.gauge(
                    '{0}matched_imeis.{1}'.format(metrics_run_root,
                                                  condition.label.lower()),
                    job_state['num_matched_imeis'])

    finally:
        _do_final_cleanup(conn, logger, locked, intermediate_tables)

        # If we had an error condition, generate an error return code on exit
        if had_errored_condition:
            sys.exit(1)
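# The classification run above guards against concurrent executions with a
# PostgreSQL advisory lock keyed on a 64-bit hash of the command name. A minimal
# sketch of that pattern follows; string_to_signed_64bit() and try_job_lock() are
# hypothetical stand-ins for DIRBS's hash_string_64bit helper, not its actual
# implementation.
import hashlib


def string_to_signed_64bit(s):
    """Map a string to a signed 64-bit integer suitable for pg advisory lock keys."""
    digest = hashlib.sha256(s.encode('utf8')).digest()
    return int.from_bytes(digest[:8], 'big', signed=True)


def try_job_lock(conn, job_name):
    """Attempt to take a session-level advisory lock for the given job name."""
    with conn.cursor() as cursor:
        cursor.execute('SELECT pg_try_advisory_lock(%s::BIGINT)', [string_to_signed_64bit(job_name)])
        return cursor.fetchone()[0]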
Example #23
0
def standard(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root,
             force_refresh, disable_retention_check, disable_data_check, debug_query_performance,
             month, year, output_dir):
    """
    Generate standard monthly operator and country-level reports.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param force_refresh: bool to force writing/generating report data from scratch
    :param disable_retention_check: bool to disable the data retention check
    :param disable_data_check: bool to disable the data check
    :param debug_query_performance: bool to debug query performance
    :param month: reporting month
    :param year: reporting year
    :param output_dir: output directory path
    """
    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))

    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn,
                               disable_data_check)

    # Next, generate all the report data so that report generation can happen very quickly
    data_id, class_run_id, per_tac_compliance_data = generate_monthly_report_stats(config, conn, month, year,
                                                                                   statsd, metrics_run_root,
                                                                                   run_id,
                                                                                   force_refresh,
                                                                                   debug_query_performance)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, data_id=data_id,
                                       classification_run_id=class_run_id)

    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, class_run_id=class_run_id,
                                        year=year, month=month, data_id=data_id)

    # First, copy all the report JS/CSS files into the output directory in
    # cachebusted form and get the cachebusted filenames
    asset_map = {}
    report_assets = [
        'js/report.js',
        'css/report.css'
    ]

    for fn in report_assets:
        logger.info('Copying required asset "%s" to report folder', fn)
        asset = pkgutil.get_data('dirbs', fn)
        name, ext = fn.split('/')[-1].split('.')
        filename = '{0}_{1}.{2}'.format(name, utils.cachebusted_filename_from_contents(asset), ext)
        asset_map[fn] = filename
        with open(os.path.join(report_dir, filename), 'wb') as of:
            of.write(asset)

    js_filename = asset_map['js/report.js']
    css_filename = asset_map['css/report.css']

    # Next, generate the country level report
    report_metadata = []
    with utils.CodeProfiler() as cp:
        logger.info('Generating country report...')
        country_name = config.region_config.name
        country_per_tac_compliance_data = None
        if per_tac_compliance_data is not None:
            country_per_tac_compliance_data = per_tac_compliance_data[OperatorConfig.COUNTRY_OPERATOR_NAME]
        report = CountryReport(conn, data_id, config, month, year, country_name,
                               has_compliance_data=country_per_tac_compliance_data is not None)
        report_metadata.extend(_write_report(report, month, year, report_dir, country_name,
                                             css_filename, js_filename, country_per_tac_compliance_data))

    statsd.gauge('{0}runtime.per_report.country'.format(metrics_run_root), cp.duration)
    operators = config.region_config.operators
    # Finally, generate the operator reports
    for op in operators:
        with utils.CodeProfiler() as cp:
            logger.info('Generating operator report for operator ID %s...', op.id)
            operator_per_tac_compliance_data = None
            if per_tac_compliance_data is not None:
                operator_per_tac_compliance_data = per_tac_compliance_data.get(op.id)
            report = OperatorReport(conn, data_id, config, month, year, op,
                                    has_compliance_data=operator_per_tac_compliance_data is not None)
            report_prefix = '{0}_{1}'.format(country_name, op.id)
            report_metadata.extend(_write_report(report, month, year, report_dir, report_prefix,
                                                 css_filename, js_filename, operator_per_tac_compliance_data))
        statsd.gauge('{0}runtime.per_report.operators.{1}'.format(metrics_run_root, op.id),
                     cp.duration)

    # Store per-report job metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)
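# Each report block above is timed with utils.CodeProfiler and the duration is
# pushed to statsd as a gauge. A sketch of an equivalent timing context manager
# follows; it is an assumed stand-in for CodeProfiler, whose actual units and
# behaviour may differ.
import time


class CodeProfilerSketch:
    """Measure the wall-clock duration of a with-block in milliseconds."""

    def __enter__(self):
        self._start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.duration = int((time.time() - self._start) * 1000)
        return False


# Example usage mirroring the report-generation blocks above:
#     with CodeProfilerSketch() as cp:
#         generate_report()
#     statsd.gauge('{0}runtime.per_report.country'.format(metrics_run_root), cp.duration)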