def finalize(args, api, configs):
    """
    Analyzes the results of the previous steps and summarizes them in the output

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """
    logger.info("Checking results")
    print("\nDATALOAD ROLLBACK RESULTS")
    success_result = configs["success_rollback_handler_filename"]
    fail_result = configs["fail_rollback_handler_filename"]

    print("\t[{}] Total processed users\n".format(configs['total_records']))
    print(
        "\t[{}] Delete success. Number of records deleted in database".format(
            count_lines_in_file(success_result)))
    print("\t[{}] Delete failures".format(count_lines_in_file(fail_result)))
    result_files = [success_result, fail_result]

    result = api.call('entity.count',
                      type_name=args.type_name,
                      timeout=args.timeout)
    print("\t[{}] Total number of records in Entity Type [{}] after execution".
          format(result["total_count"], args.type_name))

    print("\nPlease check detailed results in the files below:")
    for file in result_files:
        print("\t{}".format(file))


def dataload_rollback(args, api, configs):
    """
    Creates threads to delete records that were imported.

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """
    print("\n\nStarting the rollback process.\n")
    data_file = args.data_file
    record_count = count_lines_in_file(data_file)

    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        logger.info("Loading data from file into the '{}' entity type.".format(
            args.type_name))
        print("\tValidating UTF-8 encoding and checking for Byte Order Mark\n")

        # Create a CSV reader which will read the CSV file and return a line.
        reader = CsvReader(data_file)

        # TQDM Progress Bar.
        pbar = tqdm(total=record_count, unit="rec")
        pbar.set_description("Deleting Records.")

        # Calculate the minimum processing time per worker thread so the
        # aggregate request rate stays within args.rate_limit (a standalone
        # throttling sketch follows this function).
        if args.rate_limit > 0:
            min_time = round(args.workers / args.rate_limit, 2)
        else:
            min_time = 0
        logger.debug("Minimum processing time per worker: {}".format(min_time))

        # Iterate over the rows in the CSV and dispatch delete_record()
        # calls to the worker threads.
        futures = []
        for _, row in enumerate(reader):
            kwargs = {
                'api': api,
                'args': args,
                'uuid': row[2],
                'email': row[3],
                'batch_id': row[0],
                'line': row[1],
                'pbar': pbar,
                'min_time': min_time
            }
            futures.append(executor.submit(delete_record, **kwargs))

        # Iterate over the future results to raise any uncaught exceptions.
        # Note that this means uncaught exceptions will not be raised until
        # AFTER all workers are dispatched.
        logger.info("Waiting for workers to finish")
        for future in futures:
            future.result()

        pbar.close()

    logger.info("Rollback finished!")


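# A minimal throttling sketch (illustrative only; this helper is not called
# anywhere in the script). delete_record() and update_record() are defined
# elsewhere in this module and receive min_time through their kwargs; the
# assumption here is that each worker makes its call take at least
# args.workers / args.rate_limit seconds, so args.workers concurrent workers
# stay at or below args.rate_limit requests per second overall.
def _throttle_sketch(api_call, min_time):
    import time

    start = time.time()
    result = api_call()
    # Sleep for whatever remains so the call consumes at least min_time.
    elapsed = time.time() - start
    if elapsed < min_time:
        time.sleep(min_time - elapsed)
    return result

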
def dataload_finalize(args, api, configs):
    """
    Analyzes the results of the previous steps and summarizes them in the output

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """
    logger.info("Checking results")
    print("\nDATALOAD RESULTS")
    success_result = configs["success_handler_filename"]
    fail_result = configs["fail_handler_filename"]
    retry_result = configs["csv_retry_writer"].get_filename()

    print("\t[{}] Total processed users\n".format(configs['total_records']))
    print("\t[{}] Import success. Number of new records inserted in database".
          format(count_lines_in_file(success_result)))
    print("\t[{}] Import failures".format(count_lines_in_file(fail_result)))
    result_files = [success_result, fail_result]

    # If the retry file is not empty, add it to the result list and print the
    # info; otherwise, remove the file.
    retry_line_number = count_lines_in_file(retry_result)
    if retry_line_number > 0:
        print("\t[{}] Import retries\n".format(retry_line_number))
        result_files.append(retry_result)
    else:
        print("\n")
        delete_file(retry_result, logger)

    # If delta migration is enabled, get the update log files.
    if args.delta_migration:
        # Append the update log files to the result list.
        update_success_result = configs["update_success_handler_filename"]
        update_fail_result = configs["update_fail_handler_filename"]
        result_files.extend((update_success_result, update_fail_result))
        print("\t[{}] Update success. Existing users that were updated".format(
            count_lines_in_file(update_success_result)))
        print("\t[{}] Update failures\n".format(
            count_lines_in_file(update_fail_result)))

    result = api.call('entity.count',
                      type_name=args.type_name,
                      timeout=args.timeout)
    print("\t[{}] Total number of records in Entity Type [{}] after execution".
          format(result["total_count"], args.type_name))

    print("\nPlease check detailed results in the files below:")
    for file in result_files:
        print("\t{}".format(file))


def prepare_pbar_total_records(args, dataload_config):
    # Calculate the total number of records to be processed and store the
    # metric in the shared configuration.
    total_records = count_lines_in_file(args.data_file)

    # If start_at points past the end of the CSV file, there is nothing to do.
    if args.start_at > total_records:
        logger.info("No records to be imported after {}".format(total_records))
        print("\tNo records to be imported after {} records.".format(
            total_records))
        sys.exit()

    # If start_at is within range, subtract the skipped records from the total
    # (see the worked example after this function).
    if 1 < args.start_at <= total_records:
        total_records = (total_records - args.start_at) + 1

    dataload_config.update({'total_records': total_records})


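# Worked example of the start_at adjustment above (illustrative values only):
# a data file with 1,000 records and start_at=101 leaves
# (1000 - 101) + 1 = 900 records to process, so the progress bar total matches
# the number of records actually imported.

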
def dataload_update(args, api, configs):
    """
    Creates threads to update records if any of them were marked as duplicates
    during import

    Args:
        args: arguments captured from CLI
        api: object to perform the API calls
        configs: shared configuration variables used across the script
    """
    print("\n\nStarting the update process for the duplicated records\n")

    if args.dry_run:
        logger.debug("Dry run. Dataload update was skipped.")
        print("\tDry run mode detected. Skipping dataload update.")
        return

    if not args.delta_migration:
        return

    logger.info("Checking if there are any duplicate records to update")
    print("\tChecking if there are any duplicate records to update\n")

    data_file = configs['csv_tmp_writer'].get_filename()
    record_update_count = count_lines_in_file(data_file)
    plurals = configs['plurals']

    # Check if there is any record to be updated. If none, delete the
    # temporary file and proceed to finalize.
    if record_update_count < 1:
        print("\tNo records found to be updated\n")
        logger.info("No records found to be updated")
        delete_file(data_file, logger)
        return

    print("\t{} duplicate records were found and will be updated\n".format(
        record_update_count))

    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        logger.info(
            "Loading data from TEMP file into the '{}' entity type.".format(
                args.type_name))

        # Calculate the minimum processing time per worker thread.
        min_time = 0
        if args.rate_limit > 0:
            min_time = round(args.workers / args.rate_limit, 2)
        logger.debug("Minimum processing time per worker: {}".format(min_time))

        print("\tValidating UTF-8 encoding and checking for Byte Order Mark\n")

        # Create a CSV reader which will read the CSV TEMP file and return
        # an entire record.
        reader = CsvReader(data_file)

        # TQDM Progress Bar.
        pbar = tqdm(total=record_update_count, unit="rec")
        pbar.set_description("Updating Records.")

        # Iterate over the rows in the CSV and dispatch update_record()
        # calls to the worker threads.
        futures = []
        for _, row in enumerate(reader):
            logger.debug(row)
            record_info = {
                'record': row[2],
                'batch_id': row[0],
                'line': row[1]
            }
            kwargs = {
                'api': api,
                'args': args,
                'record_info': record_info,
                'min_time': min_time,
                'pbar': pbar,
                'plurals': plurals
            }
            futures.append(executor.submit(update_record, **kwargs))

        # Iterate over the future results to raise any uncaught exceptions.
        # Note that this means uncaught exceptions will not be raised until
        # AFTER all workers are dispatched.
        logger.info("Waiting for workers to finish")
        for future in futures:
            future.result()

        pbar.close()

    logger.info("Update finished!")

    # Delete the temporary file.
    delete_file(data_file, logger)


    logging.config.dictConfig(config)

    # Add a header row to the success and failure CSV logs. The success
    # columns follow the layout dataload_rollback() expects in its input file:
    # batch, line, uuid, email.
    success_logger.info("batch,line,uuid,email")
    fail_logger.info("batch,line,error")

    return dataload_config


if __name__ == "__main__":
    """
    Main entry point for the script when executed from the command line.
    """
    parser = RollbackArgumentParser()
    args = parser.parse_args()
    api = parser.init_api()
    dataload_config = setup_logging()

    # Calculate the total number of records to be processed and store the
    # metric in the shared configuration.
    total_records = count_lines_in_file(args.data_file)
    dataload_config.update({'total_records': total_records})

    kwargs = {
        "args": args,
        "api": api,
        "configs": dataload_config
    }
    dataload_rollback(**kwargs)
    finalize(**kwargs)
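
# Illustrative invocation (a sketch only: the real flag names are defined by
# RollbackArgumentParser, which is not shown here, so the script name and
# flags below are assumptions):
#
#     python dataload_rollback.py --data-file success_records.csv \
#         --type-name user --workers 4 --rate-limit 10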