Example #1
def finalize(args, api, configs):
    """
    Analyzes the results of the previous steps and summarizes them in the output.

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """

    logger.info("Checkign results")
    print("\nDATALOAD ROLLBACK RESULTS")

    success_result = configs["success_rollback_handler_filename"]
    fail_result = configs["fail_rollback_handler_filename"]

    print("\t[{}] Total processed users\n".format(configs['total_records']))

    print(
        "\t[{}] Delete success. Number of records deleted in database".format(
            count_lines_in_file(success_result)))
    print("\t[{}] Delete failures".format(count_lines_in_file(fail_result)))

    result_files = [success_result, fail_result]

    result = api.call('entity.count',
                      type_name=args.type_name,
                      timeout=args.timeout)
    print("\t[{}] Total number of records in Entity Type [{}] after execution".
          format(result["total_count"], args.type_name))

    print("\nPlease check detailed results in the files below:")
    for file in result_files:
        print("\t{}".format(file))
Example #2
def dataload_rollback(args, api, configs):
    """
    Creates threads to delete records that were imported.

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """
    print("\n\nStarting the rollback process.\n")

    data_file = args.data_file
    record_count = count_lines_in_file(data_file)

    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        logger.info("Loading data from file into the '{}' entity type.".format(
            args.type_name))

        print("\tValidating UTF-8 encoding and checking for Byte Order Mark\n")
        # Create a CSV reader that reads the CSV file and returns one row at a time.
        reader = CsvReader(data_file)

        # TQDM Progress Bar.
        pbar = tqdm(total=record_count, unit="rec")
        pbar.set_description("Delete Records.")

        # Calculate minimum time per worker thread
        if args.rate_limit > 0:
            min_time = round(args.workers / args.rate_limit, 2)
        else:
            min_time = 0
        logger.debug("Minimum processing time per worker: {}".format(min_time))

        # Iterate over records of rows in the CSV and dispatch delete_record()
        # calls to the worker threads.
        futures = []
        for row in reader:
            kwargs = {
                'api': api,
                'args': args,
                'uuid': row[2],
                'email': row[3],
                'batch_id': row[0],
                'line': row[1],
                'pbar': pbar,
                'min_time': min_time
            }
            futures.append(executor.submit(delete_record, **kwargs))

        # Iterate over the future results to raise any uncaught exceptions.
        # Note that this means uncaught exceptions will not be raised until
        # AFTER all workers are dispatched.
        logger.info("Waiting for workers to finish")
        for future in futures:
            future.result()

        pbar.close()
        logger.info("Rollback finished!")
Example #3
def dataload_finalize(args, api, configs):
    """
    Analyzes the results of the previous steps and summarizes them in the output.

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """

    logger.info("Checkign results")
    print("\nDATALOAD RESULTS")

    success_result = configs["success_handler_filename"]
    fail_result = configs["fail_handler_filename"]
    retry_result = configs["csv_retry_writer"].get_filename()

    print("\t[{}] Total processed users\n".format(configs['total_records']))

    print("\t[{}] Import success. Number of new records inserted in database".
          format(count_lines_in_file(success_result)))
    print("\t[{}] Import failures".format(count_lines_in_file(fail_result)))

    result_files = [success_result, fail_result]

    # If retry file is not empty, add it to the result list and print the info,
    # otherwise, remove the file.
    retry_line_number = count_lines_in_file(retry_result)
    if retry_line_number > 0:
        print("\t[{}] Import retries\n".format(retry_line_number))
        result_files.append(retry_result)
    else:
        print("\n")
        delete_file(retry_result, logger)

    # Delta migration is enabled; get the update log files.
    if args.delta_migration:
        # Append the update log files to the existing result list.
        update_success_result = configs["update_success_handler_filename"]
        update_fail_result = configs["update_fail_handler_filename"]
        result_files.extend((update_success_result, update_fail_result))

        print("\t[{}] Update success. Existing users that were updated".format(
            count_lines_in_file(update_success_result)))
        print("\t[{}] Update failures\n".format(
            count_lines_in_file(update_fail_result)))

    result = api.call('entity.count',
                      type_name=args.type_name,
                      timeout=args.timeout)
    print("\t[{}] Total number of records in Entity Type [{}] after execution".
          format(result["total_count"], args.type_name))

    print("\nPlease check detailed results in the files below:")
    for file in result_files:
        print("\t{}".format(file))
Example #4
def prepare_pbar_total_records(args, dataload_config):
    # Calculate the total number of records to be processed and store the metric.
    total_records = count_lines_in_file(args.data_file)

    if args.start_at > total_records:
        logger.info("No records to be imported after {}".format(total_records))
        print("\tNo records to be imported after {} records.".format(
            total_records))
        sys.exit()

    # If start_at is within range, only the remaining records are processed.
    if args.start_at > 1 and args.start_at <= total_records:
        total_records = (total_records - args.start_at) + 1

    dataload_config.update({'total_records': total_records})
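
For example, with 1,000 rows in the data file and a start_at value of 101, the check passes (101 <= 1000) and total_records becomes (1000 - 101) + 1 = 900, so the progress bar is sized to the 900 records that will actually be processed.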
Example #5
def dataload_update(args, api, configs):
    """
    Creates threads to update records if any of them were marked
    as duplicates during import

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """
    print("\n\nStarting the update process for the duplicated records\n")
    if args.dry_run:
        logger.debug("Dry run. Dataload update was skipped.")
        print("\tDry run mode detected. Skipping dataload update.")
        return

    if not args.delta_migration:
        return

    logger.info("Checking if there are any duplicate records to update")
    print("\tChecking if there are any duplicate records to update\n")
    data_file = configs['csv_tmp_writer'].get_filename()
    record_update_count = count_lines_in_file(data_file)
    plurals = configs['plurals']

    # Check if there is any record to be updated. If none, delete the temporary
    # file and proceed to finalize
    if record_update_count < 1:
        print("\tNo records found to be updated\n")
        logger.info("No records found to be updated")
        delete_file(data_file, logger)
        return
    print("\t{} duplicate records were found and will be updated\n".format(
        record_update_count))

    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        logger.info(
            "Loading data from TEMP file into the '{}' entity type.".format(
                args.type_name))

        # Calculate minimum time per worker thread
        min_time = 0
        if args.rate_limit > 0:
            min_time = round(args.workers / args.rate_limit, 2)

        logger.debug("Minimum processing time per worker: {}".format(min_time))

        print("\tValidating UTF-8 encoding and checking for Byte Order Mark\n")
        # Create a CSV reader that reads the temporary CSV file and returns
        # an entire record at a time.
        reader = CsvReader(data_file)

        # TQDM Progress Bar.
        pbar = tqdm(total=record_update_count, unit="rec")
        pbar.set_description("Updating Records.")

        # Iterate over records of rows in the CSV and dispatch update_record()
        # calls to the worker threads.
        futures = []
        for row in reader:
            logger.debug(row)
            record_info = {
                'record': row[2],
                'batch_id': row[0],
                'line': row[1]
            }

            kwargs = {
                'api': api,
                'args': args,
                'record_info': record_info,
                'min_time': min_time,
                'pbar': pbar,
                'plurals': plurals
            }
            futures.append(executor.submit(update_record, **kwargs))

        # Iterate over the future results to raise any uncaught exceptions.
        # Note that this means uncaught exceptions will not be raised until
        # AFTER all workers are dispatched.
        logger.info("Waiting for workers to finish")
        for future in futures:
            future.result()

        pbar.close()
        logger.info("Update finished!")

        # Delete the temporary file.
        delete_file(data_file, logger)
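
CsvReader appears in several of these examples but is not defined here. A minimal sketch of such a wrapper, assuming it strips a UTF-8 byte order mark and yields one parsed row at a time (the project's real reader likely performs additional encoding validation):

import csv

class CsvReader:
    """Iterate over CSV rows, handling a UTF-8 BOM if present (sketch only)."""

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # 'utf-8-sig' transparently drops a leading byte order mark.
        with open(self.filename, "r", encoding="utf-8-sig", newline="") as handle:
            for row in csv.reader(handle):
                yield row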
Example #6
    logging.config.dictConfig(config)

    # Add a header row to the success and failure CSV logs
    success_logger.info("batch,line,uuid,email")
    fail_logger.info("batch,line,error")

    return dataload_config


if __name__ == "__main__":
    """ Main entry point for script being executed from the command line. """
    parser = RollbackArgumentParser()
    args = parser.parse_args()
    api = parser.init_api()

    dataload_config = setup_logging()

    # Calculate the total number of records to be processed and store the metric.
    total_records = count_lines_in_file(args.data_file)

    dataload_config.update({'total_records': total_records})

    kwargs = {
        "args": args,
        "api": api,
        "configs": dataload_config
    }

    dataload_rollback(**kwargs)
    finalize(**kwargs)
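
Only the tail of setup_logging is visible in this example. A minimal sketch of how the full function might be structured, assuming it configures the loggers via dictConfig, writes the CSV header rows shown above, and returns the dataload_config dictionary whose filename keys are read by finalize(); the filenames and dictConfig layout below are hypothetical, not the project's actual configuration:

import logging
import logging.config

def setup_logging():
    """Configure result loggers and return the shared config (illustrative sketch)."""
    success_file = "rollback_success.csv"  # hypothetical filenames
    fail_file = "rollback_fail.csv"

    config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {"raw": {"format": "%(message)s"}},
        "handlers": {
            "success": {"class": "logging.FileHandler",
                        "filename": success_file, "formatter": "raw"},
            "fail": {"class": "logging.FileHandler",
                     "filename": fail_file, "formatter": "raw"},
        },
        "loggers": {
            "success": {"handlers": ["success"], "level": "INFO"},
            "fail": {"handlers": ["fail"], "level": "INFO"},
        },
    }
    logging.config.dictConfig(config)

    success_logger = logging.getLogger("success")
    fail_logger = logging.getLogger("fail")

    # Add a header row to the success and failure CSV logs, as in the snippet above.
    success_logger.info("batch,line,uuid,email")
    fail_logger.info("batch,line,error")

    return {
        "success_rollback_handler_filename": success_file,
        "fail_rollback_handler_filename": fail_file,
    }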