def post_gentemplate_cleanup(argsdict):
    # This portion of the above function has been separated out to allow for individual testing.
    # Normally we combine chunks, but in the case of styles generation this
    # is not needed except for roismap.

    logs.sts("gentemplates_by_tasklists completed.\n", 3)

    if argsdict['include_maprois']:
        #styles_completed = DB.list_subdirs_with_filepat('styles', file_pat=r'\.json$', s3flag=None)
        #attempted_but_failed_styles = [s for s in styles_on_input if s not in styles_completed]

        logs.sts("Combining roismap for each style into a single .csv file.", 3)
        DB.combine_dirname_chunks(dirname='styles', subdir='roismap',
            dest_name='roismap.csv', file_pat=r'_roismap\.csv')

        good_map_num = logs.get_and_merge_s3_logs(dirname='styles', rootname='map_report',
            chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs_good_maps')
        fail_map_num = logs.get_and_merge_s3_logs(dirname='styles', rootname='map_report',
            chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs_failed_maps')
        logs.sts(f"{good_map_num} styles successfully mapped; {fail_map_num} styles did not fully map.", 3)

    # Style logs are placed in a single folder within styles.
    # Log files are named like exc_11010_styles_chunk_84.txt;
    # downloads use file_pat=fr"{rootname}_{chunk_pat}\.txt".
    logs.get_and_merge_s3_logs(dirname='styles', rootname='log',
        chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs')
    logs.get_and_merge_s3_logs(dirname='styles', rootname='exc',
        chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs')

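# Hedged sketch: DB.combine_dirname_chunks is project-internal and its
# implementation is not shown here. The helper below only illustrates the
# general idea for local files -- concatenate the chunk .csv files matching
# a pattern into one destination file, keeping a single header row. All
# names here are illustrative, not the production code (which also handles s3).
def _sketch_combine_csv_chunks(dirpath, dest_name, file_pat):
    import re
    dest_path = os.path.join(dirpath, dest_name)
    chunk_names = sorted(n for n in os.listdir(dirpath) if re.search(file_pat, n))
    with open(dest_path, 'w', encoding='utf-8') as dest:
        for idx, name in enumerate(chunk_names):
            with open(os.path.join(dirpath, name), encoding='utf-8') as chunk:
                lines = chunk.readlines()
            # keep the header row only from the first chunk
            dest.writelines(lines if not idx else lines[1:])
    return dest_path
    # e.g. _sketch_combine_csv_chunks('styles/roismap', 'roismap.csv', r'_roismap\.csv$')
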
def combine_dirname_chunks_each_archive(argsdict, dirname):
    """ Combine all the chunks in a specific dirname into
        {archive_rootname}_{dirname}.csv files, one per archive.
        Do this in the dirname folder.
    """
    for source in argsdict['source']:
        archive_rootname = os.path.splitext(os.path.basename(source))[0]
        DB.combine_dirname_chunks(
            dirname=dirname, subdir='chunks',
            dest_name=f"{archive_rootname}_{dirname}.csv",
            file_pat=fr"{archive_rootname}_{dirname}_chunk_\d+\.csv")

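# Hedged usage sketch for the naming convention above: given a source
# archive such as 'ballots_part1.zip' (a hypothetical name), the
# per-archive destination file and chunk pattern resolve as shown below.
def _demo_archive_chunk_naming(source='ballots_part1.zip', dirname='marks'):
    import re
    archive_rootname = os.path.splitext(os.path.basename(source))[0]    # 'ballots_part1'
    dest_name = f"{archive_rootname}_{dirname}.csv"                     # 'ballots_part1_marks.csv'
    file_pat = fr"{archive_rootname}_{dirname}_chunk_\d+\.csv"
    assert re.fullmatch(file_pat, 'ballots_part1_marks_chunk_07.csv')
    return dest_name, file_pat
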
def cmpcvr_by_tasklists(argsdict: dict):
    """ ACTIVE
        Comparison with the CVR proceeds using the same chunks as were used in extraction.
        Each marks tasklist is a BIF table with information about each ballot, one per record.
        After extractvote is completed, the marks_chunks folder contains marks_df.csv for each chunk.
        Because the BIF table is sorted by 'cvrfile', this reduces the size of the CVR that must be loaded.
    """
    utils.sts('cmpcvr by tasklists', 3)

    # Get the list of all extraction tasks in the marks/tasks/ subfolder, without the .csv extension.
    # Each name is like {archive_root}_chunk_{chunk_idx}.csv
    tasklists = DB.list_files_in_dirname_filtered(
        dirname='marks', subdir='tasks', file_pat=r'.*\.csv$',
        fullpaths=False, no_ext=True)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} tasklists", 3)

    use_lambdas = argsdict['use_lambdas']
    if use_lambdas:
        LambdaTracker.clear_requests()

    # The extraction tasks are also ordered according to archive_root.
    archive_rootnames = []
    for source in argsdict['source']:
        archive_rootname = os.path.splitext(os.path.basename(source))[0]
        archive_rootnames.append(archive_rootname)

    for archive_idx, archive_rootname in enumerate(archive_rootnames):
        # Process the tasklists one archive at a time.
        cmpcvr_tasks = [t for t in tasklists if t.startswith(archive_rootname)]

        for chunk_idx, tasklist_name in enumerate(cmpcvr_tasks):
            #----------------------------------
            # This call may delegate to lambdas and return immediately
            # if 'use_lambdas' is enabled; otherwise, it blocks until the
            # chunk is completed. Once the lambda is launched, processing
            # continues at 'delegated_cmpcvr()' below.
            build_one_chunk(argsdict,
                dirname='cmpcvr',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],   # tasklist name will be like {archive_root}_chunk_{chunk_idx}
                group_name=archive_rootname,
                task_name='cmpcvr',
                incremental=False)
            #----------------------------------
            if not chunk_idx and not archive_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='cmpcvr'):
                    utils.exception_report("task 'cmpcvr' failed delegation to lambdas.")
                    sys.exit(1)

    wait_for_lambdas(argsdict, task_name='cmpcvr')

    for archive_rootname in archive_rootnames:
        # cmpcvr/chunks/disagreed_{archive_root}_chunk_{chunk_idx}.csv -- individual cmpcvr disagreed chunks
        # cmpcvr/chunks/overvotes_{archive_root}_chunk_{chunk_idx}.csv -- individual cmpcvr overvote chunks
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname + '_cmpcvr.csv',
            file_pat=fr'{archive_rootname}_chunk_\d+\.csv')
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname + '_disagreed.csv',
            file_pat=fr'disagreed_{archive_rootname}_chunk_\d+\.csv')
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname + '_overvotes.csv',
            file_pat=fr'overvotes_{archive_rootname}_chunk_\d+\.csv')
        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='log',
            chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='exc',
            chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')

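# Hedged sketch of the per-archive grouping used in cmpcvr_by_tasklists,
# assuming tasklist names follow the '{archive_root}_chunk_{chunk_idx}'
# convention. The names below are hypothetical. Note that the startswith()
# grouping assumes no archive rootname is a prefix of another.
def _demo_group_tasklists_by_archive():
    tasklists = ['ballots_part1_chunk_0', 'ballots_part1_chunk_1',
                 'ballots_part2_chunk_0']
    archive_rootnames = ['ballots_part1', 'ballots_part2']
    groups = {root: [t for t in tasklists if t.startswith(root)]
              for root in archive_rootnames}
    assert groups['ballots_part1'] == ['ballots_part1_chunk_0', 'ballots_part1_chunk_1']
    return groups
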
def genbif_from_ballots(argsdict: dict):
    """ This function is used when no CVR exists and we need to scan all the
        ballots to create BIFs. This is a slow process, so we create
        tasklists for lambda processing.
    """
    if argsdict['use_s3_results']:
        DB.delete_dirname_files_filtered(dirname='bif', s3flag=True, file_pat=None)
        DB.delete_dirname_files_filtered(dirname='bif', subdir='chunks', s3flag=True, file_pat=None)

    # Clear the lambda tracker cache.
    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    max_chunk_size  = argsdict.get('genbif_ballots_per_chunk', 200)
    max_concurrency = argsdict.get('max_lambda_concurrency', 1000)
    chunk_limit     = argsdict.get('genbif_chunk_limit', None)
    num_archives    = len(argsdict['source'])
    max_concurrency = max_concurrency // num_archives

    utils.sts('Generating tasklists to scan ballots to create bifs')

    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive = open_archive(argsdict, archive_basename)  # will open on s3 directly if using s3
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        filelist = []
        for index, file_path in enumerate(file_paths):
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)
            #_, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])
            filelist.append(';'.join(ballot_file_paths))
        utils.sts(f"Total of {len(filelist)} ballots in the archive")
        archive.close()

        chunks_lol = utils.split_list_into_chunks_lol(
            item_list=filelist, max_chunk_size=max_chunk_size,
            max_concurrency=max_concurrency)
        num_chunks = len(chunks_lol)
        utils.sts(f"Split into {num_chunks} chunks with a maximum of {max_chunk_size} ballots each.")

        # The loop below may delegate processing to lambdas.
        # Consistency checks should be performed here (or before this point)
        # to avoid costly errors, such as:
        #   1. the specified output bucket exists and is writable.
        # It would be best to make these checks as the settings file is initially processed.
        for chunk_idx, filelist in enumerate(chunks_lol):
            if chunk_limit and chunk_idx >= chunk_limit:
                break
            utils.sts(f"Processing chunk #{chunk_idx} with {len(filelist)} ballots", 3)
            build_one_chunk(
                argsdict=argsdict,
                dirname='bif',
                subdir='chunks',
                chunk_idx=chunk_idx,
                filelist=filelist,
                group_name=archive_basename,
                task_name='bif',
                incremental=argsdict['incremental_genbif'],
                )   # this may delegate to one lambda

            if argsdict['use_lambdas'] and not archive_idx and not chunk_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='bif'):
                    utils.exception_report("task 'bif' failed delegation to lambdas.")
                    sys.exit(1)

    # @@ wait_for_lambdas should be enhanced to track specific tasks, or better, use SQS messaging.
    wait_for_lambdas(argsdict, task_name='bif')

    for archive_idx, source in enumerate(argsdict['source']):
        archive_rootname = os.path.splitext(os.path.basename(source))[0]
        dirname = 'bif'
        DB.combine_dirname_chunks(
            dirname=dirname, subdir='chunks',
            dest_name=f"{archive_rootname}_{dirname}.csv",
            file_pat=fr"{archive_rootname}_{dirname}_chunk_\d+\.csv")
        logs.get_and_merge_s3_logs(dirname='bif', rootname='log',
            chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='bif', rootname='exc',
            chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')

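# Hedged sketch of the chunking helper's behavior, assuming
# utils.split_list_into_chunks_lol slices the item list into runs of at
# most max_chunk_size, enlarging the chunk size when the resulting chunk
# count would exceed max_concurrency. The real helper may differ.
def _sketch_split_list_into_chunks_lol(item_list, max_chunk_size, max_concurrency):
    import math
    chunk_size = max_chunk_size
    if math.ceil(len(item_list) / chunk_size) > max_concurrency:
        # grow the chunks so that no more than max_concurrency lambdas are needed
        chunk_size = math.ceil(len(item_list) / max_concurrency)
    return [item_list[i:i + chunk_size]
            for i in range(0, len(item_list), chunk_size)]
    # e.g. _sketch_split_list_into_chunks_lol(list(range(10)), 4, 1000)
    #      -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]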