def post_gentemplate_cleanup(argsdict):
    # This portion of gentemplates_by_tasklists has been split out to allow for individual testing.

    # Normally we would combine chunks here, but for styles generation this is only needed for the roismap.

    logs.sts("gentemplates_by_tasklists completed.\n", 3)
    
    #import pdb; pdb.set_trace()

    if argsdict['include_maprois']:
        #styles_completed = DB.list_subdirs_with_filepat('styles', file_pat=r'\.json$', s3flag=None)
        #attempted_but_failed_styles = [s for s in styles_on_input if s not in styles_completed]

        logs.sts("Combining roismap for each style into a single .csv file.", 3)
        DB.combine_dirname_chunks(dirname='styles', subdir="roismap", dest_name='roismap.csv', file_pat=r'_roismap\.csv')

        good_map_num = logs.get_and_merge_s3_logs(dirname='styles', rootname='map_report', chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs_good_maps')
        fail_map_num = logs.get_and_merge_s3_logs(dirname='styles', rootname='map_report', chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs_failed_maps')
        
        logs.sts(f"{good_map_num} styles successfully mapped; {fail_map_num} styles did not fully map.", 3)
    
    # Style logs are all placed in one folder within 'styles'.
    # Log names are like exc_11010_styles_chunk_84.txt.
    # The download uses file_pat=fr"{rootname}_{chunk_pat}\.txt"
    logs.get_and_merge_s3_logs(dirname='styles', rootname='log', chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs')
    logs.get_and_merge_s3_logs(dirname='styles', rootname='exc', chunk_pat=r'\d+_styles_chunk_\d+', subdir='logs')


def combine_dirname_chunks_each_archive(argsdict, dirname):
    """ combine all the chunks in a specific dirname into {archive_rootname}_{dirname}.csv files, one per archive.
        Do this in the dirname folder.
    """

    for archive_idx, source in enumerate(argsdict['source']):
        archive_rootname = os.path.splitext(os.path.basename(source))[0]
        DB.combine_dirname_chunks(
            dirname=dirname,
            subdir='chunks',
            dest_name=f"{archive_rootname}_{dirname}.csv",
            file_pat=fr"{archive_rootname}_{dirname}_chunk_\d+\.csv")


def cmpcvr_by_tasklists(argsdict: dict):
    """
    ACTIVE
    Comparison with CVR proceeds using the same chunks as were used in extraction.
    Each marks tasklist is a BIF table with information about each ballots, one per record.
    After extractvote is completed, marks_chunks folder contains marks_df.csv for each chunk.
    As the BIF table is sorted by 'cvrfile', this will reduce the size of CVR that must be loaded.

    """
    utils.sts('cmpcvr by tasklists', 3)

    # get the list of all extraction tasks in marks/tasks/ subfolder, without .csv extension.
    # name is like {archive_root}_chunk_{chunk_idx}.csv 
    tasklists = DB.list_files_in_dirname_filtered(dirname='marks', subdir='tasks', file_pat=r'.*\.csv$', fullpaths=False, no_ext=True)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} tasklists", 3)

    use_lambdas = argsdict['use_lambdas']

    if use_lambdas:
        LambdaTracker.clear_requests()

    # The 'extraction_tasks' are also ordered according to archive_root.

    archive_rootnames = []
    for source in argsdict['source']:
        archive_rootname = os.path.splitext(os.path.basename(source))[0]
        archive_rootnames.append(archive_rootname)

    for archive_idx, archive_rootname in enumerate(archive_rootnames):
        # process the tasklists one archive at a time.
        cmpcvr_tasks = [t for t in tasklists if t.startswith(archive_rootname)]
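        # e.g. for a hypothetical archive_rootname 'ballots_a', this keeps
        # 'ballots_a_chunk_0', 'ballots_a_chunk_1', ... and skips tasklists from other archives.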
    
        for chunk_idx, tasklist_name in enumerate(cmpcvr_tasks):
        
            #----------------------------------
            # this call may delegate to lambdas and return immediately
            # if 'use_lambdas' is enabled.
            # otherwise, it blocks until the chunk is completed.
            # once the lambda is launched, processing continues at
            # 'delegated_cmpcvr()' below.
            
            build_one_chunk(argsdict, 
                dirname='cmpcvr', 
                chunk_idx=chunk_idx, 
                filelist=[tasklist_name], #tasklist name will be like {archive_root}_chunk_{chunk_idx}
                group_name=archive_rootname,
                task_name='cmpcvr', 
                incremental=False)
            #----------------------------------

            if not chunk_idx and not archive_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='cmpcvr'):
                    utils.exception_report("task 'cmpcvr' failed delegation to lambdas.")
                    sys.exit(1)           

    wait_for_lambdas(argsdict, task_name='cmpcvr')

    for archive_rootname in archive_rootnames:
    
        #cmpcvr/chunks/disagreed_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr disagreed chunks
        #cmpcvr/chunks/overvotes_{archive_root}_chunk_{chunk_idx}.csv # individual cmpcvr overvote chunks

        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks', 
            dest_name=archive_rootname+'_cmpcvr.csv', 
            file_pat=fr'{archive_rootname}_chunk_\d+\.csv')
            
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks', 
            dest_name=archive_rootname+'_disagreed.csv', 
            file_pat=fr'disagreed_{archive_rootname}_chunk_\d+\.csv')
            
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks', 
            dest_name=archive_rootname+'_overvotes.csv', 
            file_pat=fr'overvotes_{archive_rootname}_chunk_\d+\.csv')
            
        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='log', chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='exc', chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')


def genbif_from_ballots(argsdict: dict):
    """
    This function is used when no cvr exists and we need to scan all the
    ballots to create bifs. This is a slow process, so we create
    tasklist for lambdas processing.
    """

    if argsdict['use_s3_results']:
        DB.delete_dirname_files_filtered(dirname='bif', s3flag=True, file_pat=None)
        DB.delete_dirname_files_filtered(dirname='bif', subdir='chunks', s3flag=True, file_pat=None)

    # Clear the lambda tracker cache
    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    max_chunk_size = argsdict.get('genbif_ballots_per_chunk', 200)
    max_concurrency = argsdict.get('max_lambda_concurrency', 1000)
    chunk_limit = argsdict.get('genbif_chunk_limit', None)
    num_archives = len(argsdict['source'])
    max_concurrency = max_concurrency // num_archives
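    # e.g. with the default max_lambda_concurrency of 1000 and 4 source archives,
    # each archive's chunks are limited to 1000 // 4 = 250 concurrent lambdas.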

    utils.sts('Generating tasklists to scan ballots to create bifs')
    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive = open_archive(argsdict, archive_basename) # will open on s3 directly if using s3
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        filelist = []
        for index, file_path in enumerate(file_paths):
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)
            #_, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

            filelist.append( ';'.join(ballot_file_paths) )
        utils.sts(f"Total of {len(filelist)} ballots in the archive")
        archive.close()

        chunks_lol = utils.split_list_into_chunks_lol(item_list=filelist, max_chunk_size=max_chunk_size, max_concurrency=max_concurrency)
        num_chunks = len(chunks_lol)
        utils.sts(f"Split into {num_chunks} chunks with maximum of {max_chunk_size} ballots each.")
        #count = 0
        
        # The loop below may delegate processing to lambdas.
        # We should perform consistency checks here (or before this point) to avoid costly errors, such as:
        #   1. the specified output bucket exists and is writeable.
        # It would be best to make these checks when the settings file is initially processed.
        # A hedged sketch of such a check is given below.
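        # Minimal sketch of such a check (hypothetical: assumes boto3 and an
        # 'output_bucket' entry in argsdict, neither of which is confirmed by this module):
        #
        #   import boto3
        #   s3 = boto3.client('s3')
        #   try:
        #       s3.head_bucket(Bucket=argsdict['output_bucket'])
        #   except Exception:
        #       utils.exception_report(f"Output bucket '{argsdict['output_bucket']}' is not accessible.")
        #       sys.exit(1)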
        
        
        for chunk_idx, filelist in enumerate(chunks_lol):
            if chunk_limit and chunk_idx >= chunk_limit:
                break
            utils.sts(f"Processing chunk #{chunk_idx} with {len(filelist)} ballots", 3)
            
            build_one_chunk(
                argsdict=argsdict,
                dirname='bif',
                subdir='chunks',
                chunk_idx=chunk_idx, 
                filelist=filelist, 
                group_name=archive_basename, 
                task_name='bif',
                incremental = argsdict['incremental_genbif']
                )   # this may delegate to one lambda
            #count = count+1
            if argsdict['use_lambdas'] and not archive_idx and not chunk_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='bif'):
                    utils.exception_report("task 'bif' failed delegation to lambdas.")
                    sys.exit(1)           


    wait_for_lambdas(argsdict, task_name='bif')      # @@ wait_for_lambdas should be enhanced to track specific tasks or, better, to use SQS messaging (see the sketch below).
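    # A possible shape for the suggested SQS-based tracking (hypothetical: assumes boto3
    # and a dedicated completion queue, neither of which exists in this code):
    #
    #   import boto3
    #   sqs = boto3.client('sqs')
    #   resp = sqs.receive_message(QueueUrl=completion_queue_url, MaxNumberOfMessages=10)
    #   for msg in resp.get('Messages', []):
    #       # each lambda would send one message per completed chunk, tagged with its
    #       # task_name, so completion could be tracked per task instead of polling.
    #       sqs.delete_message(QueueUrl=completion_queue_url, ReceiptHandle=msg['ReceiptHandle'])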
    
    for archive_idx, source in enumerate(argsdict['source']):
        archive_rootname = os.path.splitext(os.path.basename(source))[0]

        dirname = 'bif'

        DB.combine_dirname_chunks(
            dirname=dirname, subdir='chunks', 
            dest_name=f"{archive_rootname}_{dirname}.csv", 
            file_pat=fr"{archive_rootname}_{dirname}_chunk_\d+\.csv")
            
        logs.get_and_merge_s3_logs(dirname='bif', rootname='log', chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='bif', rootname='exc', chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')