def extractvote_by_tasklists(argsdict: dict):
    """
    ACTIVE
    This replaces the extractvotes function.
    Given tasklists which exist in the extraction_tasks folder, extract marks
    one chunk per tasklist, optionally delegating each chunk to a lambda.

    Tasklists are generated by reviewing the BIF tables.
    Each tasklist creates a separate f"marks_{tasklist_name}.csv" file in the results folder.

    :param argsdict: parsed run configuration; keys used here include
        'use_lambdas' and 'one_lambda_first'.
    """
    logs.sts('Extracting marks from extraction tasklists', 3)

    # Tasklists live in marks/tasks; the pattern skips files starting with '~'
    # (editor/backup temp files).
    tasklists = DB.list_files_in_dirname_filtered(dirname='marks', subdir='tasks', file_pat=r'^[^~].*\.csv$', fullpaths=False)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} tasklists", 3)    # fixed typo: was "taskslists"

    use_lambdas = argsdict['use_lambdas']

    if use_lambdas:
        LambdaTracker.clear_requests()

    biflist = get_biflist(no_ext=True)

    for bif_idx, bifname in enumerate(biflist):
        # The archive name is the bif name with the '_bif' suffix removed;
        # tasklist names are prefixed with the archive name.
        archive_name = re.sub(r'_bif', '', bifname)
        genmarks_tasks = [t for t in tasklists if t.startswith(archive_name)]

        for chunk_idx, tasklist_name in enumerate(genmarks_tasks):

            #----------------------------------
            # this call may delegate to lambdas and return immediately
            # if 'use_lambdas' is enabled.
            # otherwise, it blocks until the chunk is completed.

            build_one_chunk(argsdict,
                dirname='marks',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],
                group_name=bifname,
                task_name='extractvote',
                incremental=False)

            #----------------------------------

            # 'one_lambda_first': run the very first chunk to completion as a
            # smoke test before fanning out the remaining chunks.
            if not chunk_idx and not bif_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='extractvote'):
                    utils.exception_report("task 'extractvote' failed delegation to lambdas.")
                    sys.exit(1)

    wait_for_lambdas(argsdict, task_name='extractvote')

    # Merge per-chunk results and retrieve/merge the per-chunk s3 logs.
    utils.combine_dirname_chunks_each_archive(argsdict, dirname='marks')
    logs.get_and_merge_s3_logs(dirname='marks', rootname='log', chunk_pat=r"_chunk_\d+", subdir="chunks")
    logs.get_and_merge_s3_logs(dirname='marks', rootname='exc', chunk_pat=r"_chunk_\d+", subdir="chunks")
def cmpcvr_by_tasklists(argsdict: dict):
    """
    ACTIVE
    Comparison with CVR proceeds using the same chunks as were used in extraction.
    Each marks tasklist is a BIF table with information about each ballot, one per record.
    After extractvote is completed, marks_chunks folder contains marks_df.csv for each chunk.
    As the BIF table is sorted by 'cvrfile', this will reduce the size of CVR that must be loaded.

    :param argsdict: parsed run configuration; keys used here include
        'use_lambdas', 'one_lambda_first', and 'source'.
    """
    utils.sts('cmpcvr by tasklists', 3)

    # get the list of all extraction tasks in marks/tasks/ subfolder, without .csv extension.
    # name is like {archive_root}_chunk_{chunk_idx}.csv
    tasklists = DB.list_files_in_dirname_filtered(dirname='marks', subdir='tasks', file_pat=r'.*\.csv$', fullpaths=False, no_ext=True)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} tasklists", 3)

    use_lambdas = argsdict['use_lambdas']

    if use_lambdas:
        LambdaTracker.clear_requests()

    # The 'extraction_tasks' are ordered also according to archive_root.

    # archive rootname is the source filename with directory and extension stripped.
    archive_rootnames = []
    for source in argsdict['source']:
        archive_rootname = os.path.splitext(os.path.basename(source))[0]
        archive_rootnames.append(archive_rootname)

    for archive_idx, archive_rootname in enumerate(archive_rootnames):
        # process the tasklists one archive at a time.
        cmpcvr_tasks = [t for t in tasklists if t.startswith(archive_rootname)]

        for chunk_idx, tasklist_name in enumerate(cmpcvr_tasks):

            #----------------------------------
            # this call may delegate to lambdas and return immediately
            # if 'use_lambdas' is enabled.
            # otherwise, it blocks until the chunk is completed.
            # once the lambda is launched, processing continues at
            # 'delegated_cmpcvr()' below.

            build_one_chunk(argsdict,
                dirname='cmpcvr',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name], #tasklist name will be like {archive_root}_chunk_{chunk_idx}
                group_name=archive_rootname,
                task_name='cmpcvr',
                incremental=False)
            #----------------------------------

            # 'one_lambda_first': run the very first chunk to completion as a
            # smoke test before fanning out the remaining chunks.
            if not chunk_idx and not archive_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='cmpcvr'):
                    utils.exception_report("task 'cmpcvr' failed delegation to lambdas.")
                    sys.exit(1)

    wait_for_lambdas(argsdict, task_name='cmpcvr')

    for archive_rootname in archive_rootnames:

        #cmpcvr/chunks/disagreed_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr disagreed chunks
        #cmpcvr/chunks/overvotes_{archive_root}_chunk_{chunk_idx}.csv # individual cmpcvr overvote chunks

        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname + '_cmpcvr.csv',
            file_pat=fr'{archive_rootname}_chunk_\d+\.csv')

        # BUGFIX: these two dest_names lacked the '_' separator used by the
        # '_cmpcvr.csv' output above (e.g. produced 'foodisagreed.csv').
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname + '_disagreed.csv',
            file_pat=fr'disagreed_{archive_rootname}_chunk_\d+\.csv')

        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname + '_overvotes.csv',
            file_pat=fr'overvotes_{archive_rootname}_chunk_\d+\.csv')

        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='log', chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='exc', chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')
# NOTE(review): removed stray paste artifact ("Пример #3" / "0") that was not
# Python and broke the module syntax at this point.
def gentemplates_by_tasklists(argsdict):
    """
    ACTIVE
    This replaces the gentemplates function.
    Given tasklists which exist in the tasklist folder,
    read each in turn and if the number of ballots included meet a minimum,
    process each line item in turn.
    The style is the name of the tasklist.

    Tasklists are generated by reviewing the BIF tables.

    Each delegation to lambdas (or performed locally) will include
    subprocesses according to the argsdict parameters:

        include_gentemplate_tasks       - include the generation of tasklists prior to delegation.
        use_single_template_task_file   - means a single JSON file will be created instead of separate task files on s3
                                            and a portion of that task list will be passed to each lambda
        include_gentemplate             - for each style, combine ballots to create a base template
        include_genrois                 - generate regions of interest (ROIs) and OCR
        include_maprois                 - map the official contest names to what is read on the ballot to create roismap

    :param argsdict: parsed run configuration.
    """
    styles_on_input = []

    utils.sts('Generating style templates from a combined set of ballot images', 3)

    # this loads and parses the EIF
    contests_dod = create_contests_dod(argsdict)
    DB.save_data(data_item=contests_dod, dirname='styles', name='contests_dod.json')

    # style_to_contests_dol
    # if the CVR is available, we can get a list of styles that are associated with a ballot_type_id.
    # this may be enough to know exactly what contests are on a given ballot, but only if the
    # style which keys this list is also directly coupled with the card_code read from the ballot.
    # In some cases, such as Dane County, WI, this is a 1:1 correspondence. But SF has an complex
    # style conversion which is nontrivial to figure out.
    # thus, this is still needed in style discovery.

    style_to_contests_dol = DB.load_data(dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json', silent_error=True)
    if not style_to_contests_dol:
        logs.sts("CVR_STYLE_TO_CONTESTS_DICT.json not available. Trying to convert CVR to styles", 3)
        style_to_contests_dol = convert_cvr_to_styles(argsdict, silent_error=True)
        if not style_to_contests_dol:
            logs.sts("Unable to convert CVR to style_to_contests_dol, trying manual_styles_to_contests", 3)
            style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)

        # cache the recovered mapping so later runs can load it directly.
        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')

    if not style_to_contests_dol:
        logs.sts("style_to_contests_dol unavailable. full style search is required.", 3)

    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    first_pass = True

    if argsdict['use_single_template_task_file']:
        # single JSON file holds one tasklist (list-of-dict) per style.
        template_tasklists_dolod = DB.load_data(dirname='styles', name="template_tasklists_dolod.json")
        total_num = len(template_tasklists_dolod)
        utils.sts(f"Found {total_num} tasklists", 3)    # fixed typo: was "taskslists"

        for chunk_idx, (style_num, style_lod) in enumerate(template_tasklists_dolod.items()):
            if not style_num: continue

            # honor explicit include/exclude style filters (parenthesized for clarity;
            # 'and' binds tighter than 'or', so behavior is unchanged).
            if (argsdict.get('include_style_num') and style_num not in argsdict['include_style_num']) or \
                (argsdict.get('exclude_style_num') and style_num in argsdict['exclude_style_num']):
                continue

            styles_on_input.append(style_num)

            if argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue

            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles',
                subdir=style_num,
                chunk_idx=chunk_idx,
                filelist=[style_lod],            # only one style per lambda chunk, but can execute gentemplate, genrois, and maprois for same style.
                group_name=style_num,
                task_name='gentemplate',
                incremental=False,
                )

            # 'one_lambda_first': run the first delegated chunk to completion
            # as a smoke test before fanning out the rest.
            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)
                first_pass = False

    else:
        # one .csv tasklist file per style in styles/tasks.
        tasklists = DB.list_files_in_dirname_filtered(dirname='styles', subdir="tasks", file_pat=r'.*\.csv', fullpaths=False)
        total_num = len(tasklists)
        utils.sts(f"Found {total_num} tasklists", 3)    # fixed typo: was "taskslists"

        for chunk_idx, tasklist_name in enumerate(tasklists):
            # guard against a degenerate file named exactly '.csv' (empty style name).
            if tasklist_name == '.csv': continue

            style_num = os.path.splitext(os.path.basename(tasklist_name))[0]
            styles_on_input.append(style_num)

            # BUGFIX: was 'args.argsdict.get(...)' which raised NameError ('args'
            # is not in scope); every other access here uses 'argsdict' directly.
            if argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue

            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],
                group_name=style_num,
                task_name='gentemplate',
                incremental=False,
                )

            # 'one_lambda_first' smoke test, same as the branch above.
            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)
                first_pass = False

    wait_for_lambdas(argsdict, task_name='gentemplate')
    post_gentemplate_cleanup(argsdict)
def get_biflist(fullpaths=False, no_ext=False):
    """Return the list of BIF table filenames found in the 'bif' folder.

    :param fullpaths: if True, return full paths rather than bare filenames.
    :param no_ext: if True, strip the file extension from each name.
    :return: list of BIF names matching the 'bif.csv' suffix pattern.
    """
    names = DB.list_files_in_dirname_filtered('bif', subdir=None, file_pat=r'bif\.csv$', fullpaths=fullpaths)
    if not no_ext:
        return names
    return [os.path.splitext(name)[0] for name in names]