def save_ballot(self):
        """
        Saves ballot data to JSON file. It coverts ballot attributes
        to a dictionary on its own with 'get_ballot_data' helper or
        use passed 'data' dictionary.
        """

        DB.save_data(data_item=self.ballotdict,
                     dirname='results',
                     name=self.ballotdict['ballot_id'] + '.json',
                     subdir=self.ballotdict['precinct'])

    def save_ballot_pdf(self):
        """Extracts ballot pdf file to be able to view it in the web browser.
            This appears to be unused.
        
        """
        precinct = self.ballotdict['precinct']
        ballot_id = self.ballotdict['ballot_id']
        pdf_file = self.ballotimgdict['pdf_file']

        DB.save_data(data_item=pdf_file.get('bytes_array'),
                     dirname='disagreements',
                     name=f'{ballot_id}.pdf',
                     format='.pdf',
                     subdir=precinct)
def delegated_build_bif_chunk(dirname, task_args, s3flag=None):
    """ this function is suitable for execution in lambda after delegation
        can also use by local machine even if s3 is used for output.
    """

    # task_args: argsdict, archive_basename, chunk_idx, filelist
    args.argsdict = argsdict = task_args['argsdict']
    
    chunk_idx   = task_args['chunk_idx']
    filelist    = task_args['filelist']                         # the list of files to be processed in this chunk.
    subdir      = task_args['subdir']
    chunk_name  = task_args['chunk_name']
    
    archive_basename = task_args['group_name']
    archive = open_archive(argsdict, archive_basename)          # if using s3, this will open the archive on s3.
    full_file_list = get_file_paths(archive)
    if not full_file_list:
        raise LookupError(f"archive {archive_basename} appears empty")

    pstyle_region_dict = argsdict.get('pstyle_region')
    pstyle_pattern = argsdict.get('pstyle_pattern', '')

    df_dict = {}        # to save time, we build the dataframe as a dict of dicts, then create the dataframe in one swoop.
                        # format is {0: {'col_name': value, ...}, 1: {...}, ...}
    
    #filelist = filelist[0:5]
    for index, file_paths in enumerate(filelist):
    
        ballot_file_paths = re.split(r';', file_paths)
        _, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

        df_dict[index] = create_bif_dict_by_reading_ballot(argsdict, 
                                                            ballot_id, 
                                                            index, 
                                                            archive_basename, 
                                                            archive, 
                                                            ballot_file_paths,
                                                            pstyle_region_dict, 
                                                            pstyle_pattern,
                                                            chunk_idx)
    # create the dataframe all at once.
    #print(df_dict)
    chunk_df = pd.DataFrame.from_dict(df_dict, "index")

    DB.save_data(data_item=chunk_df, dirname=dirname, subdir=subdir, name=chunk_name, format='.csv', s3flag=s3flag)
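
# A minimal, self-contained sketch (with illustrative column names) of the
# dict-of-dicts pattern used above: rows accumulate as plain dicts keyed by
# row index, and the DataFrame is created in one swoop at the end, which is
# much faster than appending rows to a DataFrame one at a time.
def _demo_dict_of_dicts_to_df():
    import pandas as pd
    df_dict = {0: {'ballot_id': 'b001', 'style_num': '1001'},
               1: {'ballot_id': 'b002', 'style_num': '1003'}}
    return pd.DataFrame.from_dict(df_dict, "index")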
def delegated_gentemplate(dirname, task_args, s3flag=None):
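    """ Runs the gentemplate, genrois, and maprois sub-steps for a single style,
        according to the include_* flags in argsdict. Like delegated_build_bif_chunk,
        this is suitable for execution in a lambda after delegation, or locally.
    """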
    args.argsdict = argsdict = task_args['argsdict']
    
    chunk_idx   = task_args['chunk_idx']
    tasklist    = task_args['filelist']         # bif segment defining ballots included 
    style_num   = task_args['group_name']
    
    if isinstance(tasklist[0], str):
        # when using individual files, tasklist[0] is the tasklist file name.
        tasklist_lod = DB.load_data(dirname='styles', subdir='tasks', name=tasklist[0], format='.csv', type='lod')
    else:
        tasklist_lod = tasklist[0]
    
    if argsdict['include_gentemplate']:
        # generate a "blank" ballot image for this style in dirname 'styles'
        generate_template_for_style_by_tasklist_lod(argsdict, tasklist_lod=tasklist_lod)
    
    style_rois_list = None
    if argsdict['include_genrois']:
        # generate rois information to dirname 'rois'
        style_rois_list = genrois.genrois_one_style(argsdict, style_num)

    if argsdict['include_maprois']:
        style_rois_map_df, error_flag = maprois.maprois_discover_style(
            argsdict,
            style_num,
            style_rois_list=style_rois_list,
            #rois_map_df=None,
            contests_dod=None,
            style_to_contests_dol=None,
            )
            
        #import pdb; pdb.set_trace()
        if error_flag or not len(style_rois_map_df.index):
            logs.exception_report(f"Failed to map style:{style_num}")
            logs.report_lambda_logfile(s3dirname='styles', chunk_name=f"{style_num}_styles_chunk_{chunk_idx}", rootname='map_report', subdir='logs_failed_maps')
        else:
            logs.report_lambda_logfile(s3dirname='styles', chunk_name=f"{style_num}_styles_chunk_{chunk_idx}", rootname='map_report', subdir='logs_good_maps')
            create_redlined_images(argsdict, style_num, style_rois_map_df)
            DB.save_data(data_item=style_rois_map_df, dirname='styles', subdir='roismap', name=f"{style_num}_roismap", format='.csv')
def report_lambda_logfile(s3dirname, chunk_name, rootname="log", subdir=None):
    """ copy lambda logfile at /tmp/log.txt to s3dirname
        only if it exists and has nonzero size.
    """
    logfile_pathname = get_logfile_pathname(rootname=rootname)  # this generates the path to the lambda or local folder for the logs.
    upload_name = f"{rootname}_{chunk_name}"
    print(f"Reading logfile {logfile_pathname}")
    #import pdb; pdb.set_trace()
    buff = read_logfile(logfile_pathname)
    print(f"Saving logfile {logfile_pathname} to {s3dirname} as {upload_name}")
    if buff:
        file_path = DB.save_data(data_item=buff, dirname=s3dirname, name=f"{rootname}_{chunk_name}", format='.txt', subdir=subdir)
        print(f"logfile {rootname}, {len(buff)} characters saved to {file_path}")
def cmpcvr_by_one_tasklist(argsdict, tasklist_name):
    """ This is the primary function to be run inside lambda for cmpcvr.
    
        tasklist_name is like "{archive_root}_chunk_{chunk_idx}"
    """
    # set s3 vs local mode -- this is probably better done long before this point.
    DB.set_DB_mode()

    contests_dod = DB.load_data('styles', 'contests_dod.json')
    if CVR.data_frame.empty:
        CVR.load_cvrs_to_df(argsdict)
    
    #        marks/chunks/{archive_root}_chunk_{chunk_idx}.csv           # individual marks chunks. These are kept for cmpcvr


    if not DB.file_exists(file_name=tasklist_name+'.csv', dirname='marks', subdir="chunks"):
        utils.sts(f"Logic Error: no marks df missing: {tasklist_name}")
        traceback.print_stack()
        sys.exit(1)

    audit_df = DB.load_data(dirname='marks', subdir="chunks", name=tasklist_name, format='.csv')
    
    #---------------------------------------
    # primary call of this function performs chunk comparison
    
    overvotes_results, disagreed_results, blank_results = compare_chunk_with_cvr(
        argsdict=argsdict,
        contests_dod=contests_dod,
        cvr_df=CVR.data_frame,
        audit_df=audit_df,
        chunk_name=tasklist_name,
        )
    #---------------------------------------
    """
        cmpcvr/chunks/disagreed_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr disagreed chunks
        cmpcvr/chunks/overvotes_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr overvote chunks
    """
    DB.save_data(data_item=disagreed_results, 
        dirname='cmpcvr', subdir='chunks', 
        name=f"disagreed-{tasklist_name}.csv")

    DB.save_data(data_item=overvotes_results, 
        dirname='cmpcvr', subdir='chunks', 
        name=f"overvotes-{tasklist_name}.csv")

    DB.save_data(data_item=blank_results, 
        dirname='cmpcvr', subdir='chunks', 
        name=f"blanks-{tasklist_name}.csv")
def build_template_tasklists(argsdict):
    """ with all bif chunks created, scan them and create template_tasklists.
        each tasklist contains records from bif for ballots to be included
        in the template. These are written to template_tasklists folder.
        
        Note that this processes BIFs one at a time, rather than combining
        them all in memory, which is not scalable.
    """

    utils.sts("Building template tasklists...", 3)
    
    incomplete_style_ballots_dodf = {}      # dict keyed by style of df, not yet at the combine threshold
    completed_eff_styles_dodf = {}          # dict keyed by effective style of df, with enough ballots captured

    num_ballots_to_combine = argsdict.get('threshold', 50)

    # the following works even if the bif is generated from the CVR,
    # because the separate bif csv files are still produced.
    bif_names = get_biflist(fullpaths=False)

    if argsdict['merge_similar_styles']:
        
        #sheetstyle_map_dict = DB.load_json('styles', 'sheetstyle_map_dict.json', silent_error=False)
        sheetstyle_map_dict = DB.load_data(dirname='styles', name='sheetstyle_map_dict.json')

    for bif_name in bif_names:
        utils.sts(f"  Processing bif {bif_name}...", 3)
        
        BIF.load_bif(name=bif_name)
        reduced_df = BIF.df_without_corrupted_and_bmd()
        reduced_df = set_style_from_party_if_enabled(argsdict, reduced_df)
        
        style_nums_in_this_bif = list(reduced_df['style_num'].unique())
        utils.sts(f"  Found {len(style_nums_in_this_bif)} unique styles", 3)

        for style_num in style_nums_in_this_bif:
            utils.sts(f"Processing style:{style_num} ", 3, end='')
            previously_captured = 0

            eff_style = style_num
            if argsdict['merge_similar_styles']:
                eff_style = sheetstyle_map_dict[style_num[1:]]          # skip language char.
                # this is the contests-only style on a per-sheet basis.
                # it does not have the language included, so we add the language from the original style.
                lang_code = style_num[0:1]                              # first char
                eff_style = "%1.1u%4.4u" % (int(lang_code), int(eff_style))

                utils.sts(f"Effective (merged) style is:{eff_style} ", 3, end='')
            
            if eff_style in completed_eff_styles_dodf:
                utils.sts(" Tasklist already created", 3)
                continue

            # first see if we were already working on this style
            if eff_style in incomplete_style_ballots_dodf:
                previously_captured = len(incomplete_style_ballots_dodf[eff_style].index)
                utils.sts(f"Previously captured {previously_captured} ", 3, end='')
            # find records with this eff_style

            style_df = reduced_df[(reduced_df['style_num'] == style_num)][0:(num_ballots_to_combine-previously_captured)]
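            # e.g. (hypothetical numbers): with threshold=50 and 30 previously
            # captured, take at most the first 20 matching rows from this bif.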
            utils.sts(f" Just Captured {len(style_df.index)}", 3, end='')

            if previously_captured:
                style_df = pd.concat([incomplete_style_ballots_dodf[eff_style], style_df], ignore_index=True)
                utils.sts(f" Total captured {len(style_df.index)}", 3, end='')
            if len(style_df.index) >= num_ballots_to_combine:
                completed_eff_styles_dodf[eff_style] = style_df
                incomplete_style_ballots_dodf.pop(eff_style, None)
                utils.sts(" Full", 3)
            else:
                utils.sts(" Queued", 3)
                incomplete_style_ballots_dodf[eff_style] = style_df


    # skip those that have too few records, i.e. < min_ballots_required

    min_ballots_required = argsdict.get('min_ballots_required', 1)
    too_few_ballots_styles = []
    template_tasklists_dodf = {}
    for eff_style, style_df in {**completed_eff_styles_dodf, **incomplete_style_ballots_dodf}.items():
        num_records = len(style_df.index)
        if num_records < min_ballots_required:
            utils.sts(f"Style has too few records, {min_ballots_required} ballots are required, skipping...", 3)
            too_few_ballots_styles.append(style_num)
            continue
        template_tasklists_dodf[eff_style] = style_df
        
    # write tasklists
    utils.sts("\n  Writing tasklists:", 3)
    if not argsdict['use_single_template_task_file']:
        for eff_style, style_df in template_tasklists_dodf.items():
            utils.sts(f"  Writing tasklists for style:{eff_style} with {'%2.2u' % (len(style_df.index))} entries ", 3, end='')
            style_df.sort_values(by=['archive_basename'], inplace=True)
            pathname = DB.save_data(data_item=style_df, dirname='styles', subdir='tasks', name=str(eff_style), format='.csv')
            utils.sts(f"to {pathname}", 3)
    else:
        template_tasklists_dolod = utils.dodf_to_dolod(template_tasklists_dodf)
        utils.sts(f"Writing combined tasklists with {'%2.2u' % (len(template_tasklists_dolod))} tasklists ", 3, end='')
        DB.save_data(data_item=template_tasklists_dolod, dirname='styles', name="template_tasklists_dolod.json")

    completed_count = len(completed_eff_styles_dodf)
    incompleted_count = len(incomplete_style_ballots_dodf)

    utils.sts(  f"Total number of styles detected: {completed_count + incompleted_count} \n"
                f"            Completed tasklists: {completed_count}\n"
                f"   Incomplete tasklists created: {incompleted_count}\n"
                f"    Styles will too-few ballots: {too_few_ballots_styles}\n"
                , 3)
def gentemplates_by_tasklists(argsdict):
    """
    ACTIVE
    This replaces the gentemplates function.
    given tasklists which exist in the tasklist folder,
    read each in turn and if the number of ballots included meet a minimum,
    process each line item in turn.
    The style is the name of the tasklist.

    Tasklists are generated by reviewing the BIF tables.
    
    Each delegetion to lambdas (or performed locally) will include 
    subprocesses according to the argsdict parameters:
    
        include_gentemplate_tasks       - include the generation of tasklists prior to delegation.
        use_single_template_task_file   - means a single JSON file will be created instead of separate task files on s3
                                            and a portion of that task list will be passed to each lambda
        include_gentemplate             - for each style, combine ballots to create a base template
        include_genrois                 - generate regions of interest (ROIs) and OCR
        include_maprois                 - map the official contest names to what is read on the ballot to create roismap
        

    
    """
    styles_on_input = []
    #attempted_but_failed_styles = []   # will need to determine by looking for templates

    utils.sts('Generating style templates from a combined set of ballot images', 3)

    # this loads and parses the EIF
    contests_dod = create_contests_dod(argsdict)
    #DB.save_style(name='contests_dod', style_data=contests_dod)
    DB.save_data(data_item=contests_dod, dirname='styles', name='contests_dod.json')

    # style_to_contests_dol
    # if the CVR is available, we can get a list of styles that are associated with a ballot_type_id.
    # this may be enough to know exactly what contests are on a given ballot, but only if the 
    # style which keys this list is also directly coupled with the card_code read from the ballot.
    # In some cases, such as Dane County, WI, this is a 1:1 correspondence. But SF has an complex
    # style conversion which is nontrivial to figure out. 
    # thus, this is still needed in style discovery.

    style_to_contests_dol = DB.load_data(dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json', silent_error=True)
    if not style_to_contests_dol:
        logs.sts("CVR_STYLE_TO_CONTESTS_DICT.json not available. Trying to convert CVR to styles", 3)
        style_to_contests_dol = convert_cvr_to_styles(argsdict, silent_error=True)
        if not style_to_contests_dol:
            logs.sts("Unable to convert CVR to style_to_contests_dol, trying manual_styles_to_contests", 3)
            style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)

        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')
            
    if not style_to_contests_dol:
        logs.sts("style_to_contests_dol unavailable. full style search is required.", 3)

    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    first_pass = True

    if argsdict['use_single_template_task_file']:
        template_tasklists_dolod = DB.load_data(dirname='styles', name="template_tasklists_dolod.json")
        total_num = len(template_tasklists_dolod)
        utils.sts(f"Found {total_num} taskslists", 3)
        
        for chunk_idx, (style_num, style_lod) in enumerate(template_tasklists_dolod.items()):
            if not style_num: continue
            
            if (argsdict.get('include_style_num') and style_num not in argsdict['include_style_num']) or \
                (argsdict.get('exclude_style_num') and style_num in argsdict['exclude_style_num']):
                continue
            
            styles_on_input.append(style_num)

            if argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue
                
            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles', 
                subdir=style_num,
                chunk_idx=chunk_idx, 
                filelist=[style_lod],            # only one style per lambda chunk, but can execute gentemplate, genrois, and maprois for same style.
                group_name=style_num, 
                task_name='gentemplate', 
                incremental=False,
                )

            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)           
                first_pass = False
            # if not generate_template_for_style_by_tasklist_df(argsdict, style_num, tasklist_df):
                # attempted_but_failed_styles.append(style_num)
        
    else:    
        tasklists = DB.list_files_in_dirname_filtered(dirname='styles', subdir="tasks", file_pat=r'.*\.csv', fullpaths=False)
        total_num = len(tasklists)
        utils.sts(f"Found {total_num} taskslists", 3)

        for chunk_idx, tasklist_name in enumerate(tasklists):
            if tasklist_name == '.csv': continue
            
            style_num = os.path.splitext(os.path.basename(tasklist_name))[0]
            styles_on_input.append(style_num)

            if args.argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue
                
            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles', 
                chunk_idx=chunk_idx, 
                filelist=[tasklist_name], 
                group_name=style_num, 
                task_name='gentemplate', 
                incremental=False,
                )
            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)           
                first_pass = False

    wait_for_lambdas(argsdict, task_name='gentemplate')
    post_gentemplate_cleanup(argsdict)
def get_styles_to_contests_dominion(argsdict, ballot_type_contest_manifest='BallotTypeContestManifest.json',
                                    contest_manifest='ContestManifest.json', just_ids=False, silent_error=False):
    """
    
    Builds a styles to contests dict, where styles are ballot type_id.
    It requires BIF files to work and "BallotTypeContestManifest.json",
    "ContestManifest.json" like files in the Dominion CVR ZIP.
    :just_ids: Set to True returns "styles_to_contests_dict.json" with
    "ballot_type_id > contest_ids" instead of "ballot_type_id > contest_names".
    
    Assumes the various manifest files are in a single cvr zip file.
    
    """
    contest_id_to_names = {}
    ballot_type_to_contests = {}
    cvr_file = argsdict.get('cvr')[0]
    utils.sts(f'Loading CVR {cvr_file}')
    cvr_archive = open_archive(argsdict, cvr_file, testzip=False, silent_error=silent_error)

    # First open contests manifest to build dict
    # contest id > contest name.
    try:
        with cvr_archive.open(contest_manifest) as manifest_file:
            utils.sts(f'Loaded {contest_manifest}')
            data = json.loads(manifest_file.read()).get('List')
    except (FileNotFoundError, ValueError) as error:
        if not silent_error:
            logs.exception_report(f"Could not load {contest_manifest} from CVR archive {cvr_file} due to %s", error)
            sys.exit(1)
        else:
            return None
    
    utils.sts(f'Loaded manifest data, {len(data)} rows found')
    for row in data:
        contest_id = str(row.get('Id'))
        contest_name = row.get('Description')
        contest_id_to_names[contest_id] = contest_name
    utils.sts(f'Contest ids to names dict built, {len(contest_id_to_names)} rows found')
    del data

    # Then open ballot type contest manifest to build dict
    # ballot type id > list of contest names/ids.
    try:
        with cvr_archive.open(ballot_type_contest_manifest) as manifest_file:
            utils.sts(f'Loaded {ballot_type_contest_manifest}')
            data = json.loads(manifest_file.read()).get('List')
    except (FileNotFoundError, ValueError) as error:
        if not silent_error:
            logs.exception_report(f"Could not load {manifest_file} from CVR archive {cvr_file} due to %s", error)
            sys.exit(1)
        else:
            return None
            
    utils.sts(f'Loaded manifest data, {len(data)} rows found')
    for row in data:
        type_id = row.get('BallotTypeId')
        contest_id = str(row.get('ContestId'))
        contest_name = contest_id_to_names.get(contest_id) if not just_ids else contest_id
        if not ballot_type_to_contests.get(type_id):
            ballot_type_to_contests[type_id] = [contest_name]
        else:
            ballot_type_to_contests[type_id].append(contest_name)
    utils.sts(f'Ballot type ids to contests dict built, {len(ballot_type_to_contests)} rows found')
    del data

    #DB.save_json('styles', f"{config_dict['CVR_STYLE_TO_CONTESTS_DICT_FILENAME']}.json", ballot_type_to_contests)
    DB.save_data(ballot_type_to_contests, 'styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')

    
    # at this point, the styles_to_contests dict of list is created, where the key is the ballot_type_id
    # for each style in this list, split it between pages.    

    if False:   # this needs to be updated; not currently used. Was: argsdict['merge_similar_styles']
    
        # NOTE: It is invalid to merge styles at this point, only based on the contests in them, due to language differences.
        
        # for a given sheet, we may be able to merge styles while still respecting language differences. 
        # given ballot_type_id, look up contests on ballot.
        # using EIF, split contest list into separate list for each sheet.
        # for this sheet, compare list of contests with sheet_based_style_list
       
        contests_dod = create_contests_dod(argsdict)    # this reads the EIF
        
        sheetstyle_dol = {}

        for type_id, contest_list in ballot_type_to_contests.items():
            
            grouped_dol = utils.group_list_by_dod_attrib(contest_list, contests_dod, 'sheet0')    
                # Access EIF to get the sheet information for each contest.
                # this produces a dict of contest lists, keyed by sheet value.
                
                # input might be: contest_list = ['contest1', 'contest2', 'contest3', ... ]
                #                 contests_dod = {'contest1': {'sheet0': 0}, 'contest2': {'sheet0': 0}, 'contest3': {'sheet0': 1}, 'contest4': {'sheet0': 1}, ... }
                # output: grouped_dol = {0: ['contest1', 'contest2'], 1: ['contest3', 'contest4'] }
                
            for sheet0, sheet_contest_list in grouped_dol.items():
                if not sheet_contest_list: continue
                
                sheetstyle_num = "%1.1u%3.3u" % (sheet0 + 1, type_id)
                sheetstyle_dol[sheetstyle_num] = sheet_contest_list
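                # e.g. (illustrative): sheet0=0, type_id=42 -> sheetstyle_num '1042'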
                
        # now each ballot_type_id, which includes the contests for all sheets, has been split 
        # into separate styles for each sheet, and with only those contests for that sheet included.
        
        reduced_sheetstyle_dict, sheetstyle_map_dict = utils.reduce_dict(sheetstyle_dol)
        
        # the reduced_sheetstyle_dict includes a minimal subset of those sheetstyles that are unique.
        # the sheetstyle_map_dict provides a way to find the same list using the redundant key.
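        # Illustrative (hypothetical) reduce_dict behavior:
        #   sheetstyle_dol = {'1001': ['A', 'B'], '1002': ['A', 'B'], '1003': ['C']}
        #   -> reduced_sheetstyle_dict = {'1001': ['A', 'B'], '1003': ['C']}
        #      sheetstyle_map_dict     = {'1001': '1001', '1002': '1001', '1003': '1003'}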
        
        #DB.save_json('styles', 'reduced_sheetstyle_dict.json', reduced_sheetstyle_dict)
        DB.save_data(reduced_sheetstyle_dict, 'styles', name='reduced_sheetstyle_dict.json')
        #DB.save_json('styles', 'sheetstyle_map_dict.json', sheetstyle_map_dict)
        DB.save_data(sheetstyle_map_dict, 'styles', name='sheetstyle_map_dict.json')
def convert_cvr_to_styles_ess(argsdict: dict = None, silent_error: bool = False):
    """ ACTIVE -- this is used to create BIF.
        Open each of the ESS CVR files and create cvr_ballotid_to_style_dict
        by reading the 'Ballot Style' column.
        Returns the cvr_ballotid_to_style_dict.
    """
    
    if not argsdict['cvr'] or argsdict['cvr'] == ['(not available)'] or argsdict['cvr'] == ['']:
        utils.sts("CVR file not specified")
        if silent_error:
            return {}
        else:
            sys.exit(1)
    cvr_replacement_header_list = get_replacement_cvr_header(argsdict)
    master_styles_dict = {}
    cvr_ballotid_to_style_dict = {}
    for cvr_file in argsdict['cvr']:
        utils.sts(f"Processing cvr file: {cvr_file}", 3)
        #cvr_df = pd.read_excel(cvr_file, engine='xlrd')
        cvr_df = DB.load_data(dirname='archives', name=cvr_file, user_format=False)
        
        # probably all of this jazz below should be encapsulated.

        if cvr_replacement_header_list:
            # use the official contest names for column headers instead of those provided.
            orig_col_names = cvr_df.columns
            if len(orig_col_names) != len(cvr_replacement_header_list):
                utils.sts("official contest names list not right length to replace header names in CVR")
                sys.exit(1)
            # we will replace any "blank" col names with "Unnamed: XXX" so we can remove them later.
            for i, orig_col_name in enumerate(orig_col_names):
                if re.match(r'^Unnamed:', orig_col_name):
                    cvr_replacement_header_list[i] = orig_col_name
            cvr_df.columns = cvr_replacement_header_list

        # remove columns that had no names. These are when vote_for is > 1.
        dup_col_names = []
        for column in list(cvr_df.columns):
            if re.match(r'^Unnamed:', column):
                dup_col_names.append(column)
        cvr_df.drop(columns=dup_col_names, inplace=True)

        # check for duplicate contest names.
        if argsdict.get('check_dup_contest_names', True):
            duplicates = utils.find_duplicates(cvr_df.columns)
            if duplicates:
                string = '\n'.join(duplicates)
                utils.sts(f'Duplicate columns detected in CVR. All contest names must be unique.\n'
                          f'{string}')
                sys.exit(1)
        utils.sts('Generating cvr_to_styles_dict', 3)
        styles_dict = cvr_to_styles_dict(argsdict, cvr_df)
        
        utils.sts('Generated cvr_to_styles_dict OK', 3)
        # combine with the master_styles_dict, discarding any duplicates that might span cvr blocks.
        master_styles_dict = {**master_styles_dict, **styles_dict}
        ballotid_to_style_dict = cvr_to_ballotid_to_style_dict(cvr_df)
        cvr_ballotid_to_style_dict = {**cvr_ballotid_to_style_dict, **ballotid_to_style_dict}

    total_styles = len(master_styles_dict)

    utils.sts(f"Total of {total_styles} unique styles detected.\nWriting styles to contests dict to JSON file...", 3)

    DB.save_data(master_styles_dict, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')
    
    return cvr_ballotid_to_style_dict
def main():
    utils.show_logo()
    print(  f"\n\n{'=' * 50}")

    argsdict = args.get_args()          # parses input_file as specified in CLI using arg_specs.csv
    args.argsdict = argsdict
    
    print("argsdict:")
    print(pprint.pformat(argsdict))

    print(  f"\n\n{'=' * 50}")

    if (argsdict.get('self_test')):
        self_test.self_test(argsdict)


    """ The paths of archives is normalized to allow the archives to be either local or on s3.
        'archives_folder_path' -- path to folder on local system.
        'archives_folder_s3path' -- s3path to folder on s3
        'source' list are basenames, without path, but including extension.
        
    """


    # if argsdict['archives_folder_path'] and not argsdict['source']:
        # # create a list of source archives in the source folder.
        # srcdict = {}
        # dirdict = utils.get_dirdict(argsdict['archives_folder_path'], '.zip')
        # for name, path in dirdict.items():

            # if (name in argsdict['exclude_archives'] or
                # argsdict['include_archives'] and not name in argsdict['include_archives']):
                # continue
            # srcdict[name] = path

        # argsdict['source'] = list(srcdict.values())
        # argsdict['srcdict'] = srcdict
        # utils.sts(f"input directive 'source' resolved to: {argsdict['source']}", 3)

    op = argsdict.get('op', 'all').lower()
    
    DB.set_DB_mode()
    
    """ =======================================================================
        PRIMARY API ENTRY POINTS
        
        Each one of the following relies on a job file which provides the settings
        as parameter,value in csv file, where comments are allowed preceded by #.
        Thus the api must provide 
            -i path             location of settings file -- could be file on s3.
            -op operation       string like 'genbif_from_cvr'
            
        Each function produces:
            log.txt                 appends extensive status reports.
            exception_report.txt    appends each exception encountered. 
                                        exceptions to processing and not python exceptions, per se.
                                        
            as well as other files, noted below.
            
        Initial implementation will include one major entry point with operation selection as follows:
            'genbif_from_cvr'           (Fast)
            'genbif_from_ballots'       (Slow)
            'create_bif_report'         (Fast)
            'gentemplates'              (Slow)
            'genmaprois'                (Somewhat slow)
            'extractvote'               (Very slow)
            'genreport'                 (fast)
            'cmpcvr_and_report'         (somewhat slow)
            'get_status'                (fast) - return status of slow functions.    
                op='get_status' ref='function'
                    where function = one of 'genbif_from_ballots', 'gentemplates', 'genmaprois', 'extractvote'
            
        In the functions below, argsdict is established from the settings file.
        
    """

    if op == 'copy_config_files_to_s3':
        """ This function will copy local config files in EIFs to s3, to simulate
            interaction with the frontend website, which will upload and place files
            s3://us-east-1-audit-engine-jobs/{job_name}/config/ 
            
            Files to be placed there:
                JOB settings file
                EIF file
                BOF file
                manual_styles_to_contests
                style_lookup_table
                
            When running in local mode, these are in either EIFs/ or input_files/ in the repo folder.
                
        """
        DB.upload_file_dirname('config', argsdict['eif'])
        DB.upload_file_dirname('config', argsdict['bof'])
        DB.upload_file_dirname('config', argsdict['manual_styles_to_contests_filename'])
        DB.upload_file_dirname('config', argsdict['style_lookup_table_filename'])
        DB.upload_file_dirname('config', argsdict['input'], local_dirname='input_files')
            
    elif op == 'precheck_job_files':
        """ This function simply does a precheck of the job files that exist
            in the config folder for this job on s3.
        """
        pass
    
    elif op == 'genbif_from_cvr':
        """ 
        If CVR file(s) are provided with style information included, 
        this operation builds "ballot information file" BIF data by reviewing the CVR
        May also use path information of ballots in archives for precincts, groups, party.
        For Dominion, scan CVR JSON chunks and fill in info about ballots.
        Creates one .csv file for each archive in folder bif.
        This is a relatively fast operation, typically completed in a matter of seconds.
        Result:
            BIF data file ready for BIF report.
            log
            exception report
        """
        genbif_from_cvr(argsdict)


    elif op == 'genbif_from_ballots':
        """ 
        If no CVR is available, we must scan the ballots to generate the bif.
        Each ballot is reviewed and style information is read from the ballots.
        May also use path information of ballots in archives for precincts, groups, party.
        This can be done by lambdas and should complete within minutes but
        typically will not complete during a single REST post/response.
        Result:
            BIF ready to produce BIF report.
            separate folder for each failing ballot to allow investigation.
            log
            exception report
        """
        genbif_from_ballots(argsdict)
        
    # elif op == 'get_status':
        # """ This function provides status operation in terms of % complete.
        # """
        # if ref == 'genbif_from_ballots':
            # return get_status_genbif_from_ballots(argsdict)
        # elif ref == 'gentemplates':
            # return get_status_gentemplates(argsdict)
        # elif ref == 'genmaprois':
            # return get_status_genmaprois(argsdict)
        # elif ref == 'extractvote':
            # return get_status_extractvote(argsdict)
        # else:
            # utils.sts(f"ref '{ref}' not supported by op=get_status", 3)

    elif op == 'create_bif_report':
        """ 
        as a result of validate_bifs or genbif_from_ballots, this report is 
        generated, or it can be generated once the BIF is built. Report provides:
            Number of Ballot Archives
            Total number of BIF records
            Unique ballot_ids
            Duplicate ballot_ids
            Number of CVR files
            Number of precincts
            Number of parties
            Number of style_nums
            Number of card_codes
            Number of ballots w/o card_codes
            Number of BMD ballots
            Number of corrupted ballots (could not be read)
            Number of different sheets
            Number of each sheet
        
        This operation completes quickly and currently produces a text report to console.
        Can provide alternative data output as JSON or HTML through command line switch.
            
        """
        create_bif_report(argsdict)
        
    elif op == 'build_template_tasklists':
        """ 
        Scan bifs and generate template tasklists, with one tasklist csv file per style.
        tasklist is the same format as bif but should not be updated with any information.
        This is generally not used as a REST entry point.
        """
        build_template_tasklists(argsdict)

    elif op == 'gentemplates':
        """ this function requires that BIF data is available. Used as REST entry point.
            1. generates template tasklists
            2. constructs templates by combining usually 50 ballots to improve resolution.
            Result is a set of raw templates (PNG files), one for each style,
            and possibly also checkpoint images including the components (up to 50).
            
            This function takes significant time, more than a minute per style. 
            However, this can be delegated to lambdas and may be completed 
            in (# styles/1000) * time per style, but still too long for a single REST POST.
            If all 10,000 styles are used in SF, time is 10 minutes.
            
            Log file updated.
            Report generated of result.
            PNG files for review of each style.
        """
        if argsdict['include_gentemplate_tasks']:    # sub tasks in gentemplate action - generate base templates
            build_template_tasklists(argsdict)
            
        gentemplates_by_tasklists(argsdict)

    elif op == 'gentemplates_only':
        """ This function used for debugging only when tasklists are already generated.
            Tasklists take only seconds to complete now.
            NOT USED IN REST API
        """
        gentemplates_by_tasklists(argsdict)

    elif op == 'genrois':
        """
        After templates are generated, each style is image-analyzed and then OCR'd.
        Result is set of PNG images providing regions of interest (ROIs) determined.
        Style templates must be generated by this point to allow further analysis and generation of rois.
        Produces the json list of rois and the image for each result.
        
        Result:
            Creates a report of rois generated
            PNG image files with graphic outlines of rois that can be reviewed by the user.
        """
        genrois(argsdict)

    elif op == 'maprois':
        """
        Once Rois are generated, they can be fairly quickly mapped to contests and options based on information
        in the EIF - Election Information File. This operates at the rate of several seconds per style.
        Result is 
            PNG "redlines" showing the mapping of contests and options to each style.
            Map report, providing detail of where mapping may have gotten off track.
            Log.
        """
        maprois(argsdict)

    elif op == 'genmaprois':
        """ 
        Major REST entry point.
        This is the most typical operation once templates have been generated, which may take
        time and use compute resources. May need to be done repetitively while operator makes
        changes to settings file. Operator must review the map report and redlines.
        Once review is completed, then extraction can commence.
        Can break this up for processing by lambdas but it is so fast now that it may not be necessary.
        Result is:
            PNG images showing ROIS from genrois
            PNG redlines showing the correspondence of contests and options for each style.
            failures copied to assist folder
            Map Report
            Log
        """
    
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'get_assist_requests':
        """ 
        After genmaprois is completed, some styles may need manual assistance by human operator.
        This is used in graphic-mode dominant rois generation rather than OCR dominant generation.
        Front end first requests assist requests, and the response is
            list of ballot_ids which needs assistance.
            path to each template file
            path to existing json file for that template.
            
        NOTE this is a new function which is not implemented yet.
        """
        pass
        
    elif op == 'write_new_assist_annotation':
        """ The front end will implement functionality like is implemented by 
            tools/template_edit.py, to allow the user to add rectangular regions,
            horizontal and vertical lines, to the image.
            Then, this writes a new JSON annodation file.
            Maybe this does not need to be provided if frontend can write to s3 directly.
        
        NOTE this is a new function which is not implemented yet, but is implemented
            for CLI operation as 'template_edit' using tools/template_edit.py
        """
        pass
        
    elif op == 'build_extraction_tasks':
        """ Scan bifs and generate extraction tasklists, with an appropriate number of ballots for each lambda.
            tasklist is the same format as bif and should not be updated with any information by lambda.
            This function completes rapidly and thus is combined with actual extraction.
        """
        build_extraction_tasks(argsdict)

    elif op == 'extractvote_only':
        """ with extraction tasklists already built, go through all the ballots in the 
            archives and extract the marks into single csv data table for each tasklist, 
            and then combine into a single csv file for each archive.
            Each tasklist is delegated to a separate lambda process.
            Each lambda can take up to 15 minutes to process one tasklist. Total time of this
            process is less than (# ballots / 200,000) * 15 minutes.
            So for a county like SF, with 500K ballots, upper limit is about 35 minutes.
            LA, the largest county in the US, has about 6 million ballots; upper limit is 7.5 hours.
        """
        extractvote_by_tasklists(argsdict)
        #extractvote(argsdict)

    elif op == 'extractvote':
        """ Build extraction tasklists and then extract vote 
            Perform both the tasklist generation (fast) and extraction (slow) above.
            This is the normal REST entry point.
            Result is 
                marks_df.csv for each archive.
                Extraction Report
                Log
                Exception Report
        """
        # go through all the ballots in the archives and extract the marks into single json file for each archive
        build_extraction_tasks(argsdict)
        extractvote_by_tasklists(argsdict)

    elif op == 'genreport':
        """
        Once extraction is completed, a report of results can be produced independent of the voting 
        system results, or CVR. Can be compared with high-level election results.
        
        Result:
            summary of the election results per audit system.
            Includes total number of ballots:
                not processed by audit system due to misalignment or other corruption.
                not provided in archives.
            Compares with high-level election result.
            
        """
        genreport(argsdict)

    elif op == 'cmpcvr':
        """ If a CVR is available and the voting system evaluation of each ballot
            is provided, then this function compares the audit system result with
            the voting system cvr and provides a comprehensive result.
            This function processes each marks_df.csv that corresponds to each archive, and
            compares each record with CVR, which is fully combined into one data file by this
            function.
            Result:
                cmpresult_n.csv for each archive n processed.
                These files are not combined into a single report.
        """
        cmpcvr_by_tasklists(argsdict)

    elif op == 'gen_cmpcvr_report':
        """ 
        The result of cmpcvr is on an archive-by-archive basis and compares
        the combined CVR, which is generally not organized by archive, with the 
        marks_df.csv which are organized by archive. Creates a ballot-by-ballot
        comparison result on per-archive basis as csv file. Includes any 
        adjudications in the determination of discrepancies.
        Result:
            comprehensive report of the comparison, as JSON or text.
            JSON discrepancy list reduced to just the discrepancies.
            
        """
        generate_cmpcvr_report(argsdict)
        
    elif op == 'cmpcvr_and_report':
        """
        This is a major REST entry point.
        compares the CVR and creates a report by combining the above two functions.
        """
        cmpcvr_by_tasklists(argsdict)
        generate_cmpcvr_report(argsdict)
       
        
    elif op == 'get_discrepancy_list':
        """ new function for front end. After cmpcvr is completed, a full report is created. 
            This provides just the discrepancies to allow for adjudication in frontend UI,
            and the existing adjudication JSON file.
            This is a new function.
            Result:
                JSON list of discrepancies
                log updated.
            NOTE: THIS IS A NEW FUNCTION
        """
        pass
        
    elif op == 'submit_adjudications':
        """ front end will implement a review of all discrepancies and provides
            a DRE-like entry of votes as determined by review of ballot images
            This is a new function.
            Perhaps front end updates the adjudication file but this function 
            may be better so the action is properly logged.
            Results:
                status
                log updated.
            NOTE: THIS IS A NEW FUNCTION
        """
        pass

    # =============================================================================
    #    Updates the lambdas functions.
    # =============================================================================
    
    elif op == 'update_lambda' or op == 'update_lambdas':
        """ to run this function, you must first delete the tree 'lambda_deploytment'
            including the folder.
        """

        branch = argsdict.get('update_branch', 's3-and-lambdas-dev')

        function_name = argsdict.get('lambda_function', 'all')
        if function_name == 'all':
            update_lambda(update_all=True, branch=branch)
        else:
            update_lambda(function_name=function_name, branch=branch)

    # =============================================================================
    #    Additional operations only used for development and CLI operation.
    # =============================================================================
    
    elif op == 'post_gentemplate_cleanup':
        post_gentemplate_cleanup(argsdict)
    
    # elif op == 'combine_bif_chunks':
        # """ used for testing combining bif chunks
        # """
        # utils.combine_dirname_chunks_each_archive(argsdict, dirname='bif')
        
        
    elif op == 'get_manual_styles_to_contests':
    
        logs.sts("Processing manual_styles_to_contests", 3)
        style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)
        
        logs.sts(f"style_to_contests_dol:\n {pprint.pformat(style_to_contests_dol)}")

        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')


    elif op == 'web2eif':
        """
        This operation scrapes a high-level report of results from a provided url.
        It was thought at the time that this report would provide unique contest names
        and consistent option names, but even though they were shorter and a bit better
        than the CVR, they also were insufficient for our needs. Thus, although this
        does provide a basic function, it is not up to date with the current EIF format
        and does not eliminate the need for the EIF and manual editing.
        RESEARCH ONLY.
        """
        web_scraper.run_scraper(url=argsdict['url'])
        sys.exit()

    #elif op == 'tidycvr':
    #    """ This operation converts and ES&S cvr to tidy format
    #    Although it is operational, it was found that the existing ES&S format was
    #    a reasonably consice and useful format and we would work with it.
    #    """
    #    tidy_ess_cvr(argsdict)
    #    sys.exit()

    elif op == 'cvr2styles':
        """
        DEPRECATED. Use validate_bifs or genbif_from_ballots
        This operation preprocesses an ES&S CVR file or multiple Dominion CVR files.
        creates two dicts:
        styles_dict, which provides contest list for each style_num
        ballotid_to_style dict, which provides style_num based on ballotid.
        This currently only works if the CVR has a column providing the style, named 'Ballot Style'.
        Would need a different approach if no Ballot Style column is provided, such as
            creating a logical style ID, perhaps a bitstring of contests, and using that as a logical style identifier.
            This would not match any style designator on the ballot.
        Processes multiple CVR files one at a time. (scalable)

        convert_cvr_to_styles function is in styles_from_cvr_converter.py
        for dominion, get_styles_to_contests_dominion is in gentemplate.py
        """
        convert_cvr_to_styles(argsdict)

    elif op == 'gentrm':
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'tltrm':
        build_template_tasklists(argsdict)
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'alltemplates':
        """
        Perform all the steps to creation of templates
        """
        genbif_from_cvr(argsdict)
        build_template_tasklists(argsdict)
#        convert_cvr_to_styles(argsdict)
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    # elif op == 'download_results':
        # # download all results from s3 bucket.
        # s3utils.download_entire_dirname(argsdict, dirname='marks')
        # s3utils.get_and_merge_lambda_logs(argsdict)

    elif op == 'download_gentemplates':
        # download all gentemplates from s3 bucket.
        # NOT UPDATED TO NEW FILE STRUCTURE
        DB.download_entire_dirname(dirname='styles')
        #DB.download_entire_dirname(dirname='styles')

    elif op == 'delete_s3_results':
        # delete all results on s3 bucket.
        DB.delete_s3_results(argsdict)

    elif op == 'merge_results':
        """ merge results into single csv file.
        """
        utils.merge_results()

    elif op == 'check_extraction':
        check_extraction(argsdict)

    elif op == 'extractcmp':
        build_extraction_tasks(argsdict)
        extractvote_by_tasklists(argsdict)
        cmpcvr_by_tasklists(argsdict)
  
    # elif op == 'getlogs':
        # DB.get_and_merge_s3_logs()

    elif op == 'plotmetrics':
        plotmetrics()

    elif op == 'evalmarks':
        evalmarks()

    elif op == 'save_failing_ballots':
        # given list of ballots in inputfile, copy the original ballot image files
        # to (jobname)/styles/(ballot_id) folders
        
        # this function
        #   1. builds single bif table.
        #   2. looks each ballot up.
        #   3. using entry, opens the indicated archive and extracts the original file.
        #   4. saves the file in folder of jobname and ballot_id in styles, see above.
        save_failing_ballots(argsdict)

    elif op == 'reprocess_failing_ballots':
    
        reprocess_failing_ballots(argsdict)


    else:
        print("op value not defined ", op)
        sys.exit()
def generate_style_template(argsdict: dict,
                            ballots: list,
                            style_num,
                            sheet0=0,
                            omit_ballot_images=False):
    """
    ACTIVE 
    Function which takes a list of Ballot instances and generate
    a new style template with information like ballot code, number
    and regions of interests (ROI). To achieve that, function creates
    a weighted image of ballot based on a list of all passed 'ballots'
    (they should be in similar alignment and shape). Then function looks
    for ROIs and extract data contained within weighted image with OCR tool.
    
    TO MOVE THIS TOWARD IMPLMENTATION COMPATIBLE WITH LAMBDAS
    1. the caller this function should, instead of generating a list of Ballot instances
        with the image already extracted from the file, into just a list of pathnames
        to process. So the Queues.py class should be oriented to just keeping a single
        dict of list structure, where the key of the dict is the style_num, and the
        list containing the ballots pathnames that are of that style.
    2. We must add an intermediate function to make this conversion, which will
        take that list and for each ballot, open it and load the images for each file, 
        and then call this function. Let's assume we call that function
        'generate_style_template_from_paths(ballot_paths: list, style_num)'
        It will be the appropriate operation type that can be ported to work on lambdas.
    3. The result of this function will be only the combined template. It will be
        reasonable to continue with the subsequent steps for this style, such as
        genrois and maprois. Those functions take the combined template plus
        EIF file information to finally generate at roismap_df for the style.
        Each roismap_df is combined together after all lambdas are competed to 
        produce the roismap_df which is later used in the extraction process.
    4. Result of style generation lambda will be:
        1. list of pathnames actually used in the style generation, in cause some were
            inappropriate or unusable.
        2. roismap_df for that style.
        3. combined template with redlines of the rois that are mapped to it.
        
    sheet value is simply added to the style dict. The sheet is used for any later 
    drawing of lines which may only be appropriate for one of the sheets.
        
    """
    #use_sync_timing = True

    utils.sts(
        f"Generating ballot style templates for style {style_num} using {len(ballots)} ballots...",
        3)
    if not ballots:
        utils.exception_report(
            "generate_style_template: List of ballots is empty")
        return False

    #ballots.sort(key=sum_determinants)
    # consider only the first ballots. Maybe better to choose the ballots with the least stretch.
    ballots = ballots[:config_dict['LAYERS_FOR_EMPTY_BALLOT']]
    style = Style(style_num=style_num)
    style.sheet0 = sheet0
    style.target_side = argsdict['target_side']
    style.build_from_count = len(ballots)
    style.precinct = ballots[0].ballotdict['precinct']
    style.build_from_ballots = [
        ballot.ballotdict['ballot_id'] for ballot in ballots
    ]
    weighted_images = []
    pages = range(len(ballots[0].ballotimgdict['images']))

    utils.sts("Generating the average timing marks for minimal corrections", 3)
    std_ballot_num = choose_unstretched_ballot(ballots)

    utils.sts("stretch_fix all ballots to std_timing_marks", 3)
    stretch_fix_ballots(argsdict, ballots, std_ballot_num)

    # first save them so we can diagnose any problem.
    if argsdict['save_checkpoint_images'] and not omit_ballot_images:
        utils.sts("Saving checkpoint images...", 3)
        #confirmed this is working to s3.
        save_style_ballot_images(ballots, style_num)

    utils.sts("Combining images to create template for each page...", 3)
    for page in pages:

        # always combine page 0; include subsequent pages only when page 1 is not
        # flagged blank and timing marks were found on the reference ballot.
        if not (page and
                (ballots[0].ballotdict.get('p1_blank', False)
                 or not ballots[0].ballotdict.get('timing_marks', []))):

            weighted_images.append(get_weighted_image_from_page(page, ballots))

    # image templates must be saved outside style
    utils.sts("Saving style template images...", 3)
    style.filepaths = save_style_template_images(style_num, weighted_images)

    style.timing_marks = ballots[std_ballot_num].ballotdict['timing_marks']

    utils.sts("Saving style object...", 3)
    #DB.save_style(name=style_num, style_data=vars(style))
    DB.save_data(data_item=vars(style),
                 dirname='styles',
                 subdir=style_num,
                 name=f'{style_num}_style')
    """
    style_dict saved at this point:
        'build_from_count':     int number of ballots included in the generation of the template
        'precinct':             str precinct designation
        'build_from_ballots':   list of ballot_ids that were used to build the template.
        'filepaths':            list of template files produced
    """
    utils.sts("Saved combined image tamplates...", 3)
    return True
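
# --- Hedged sketch only (not in the source): the intermediate function proposed
# in step 2 of the docstring above, 'generate_style_template_from_paths'. The
# Ballot constructor and image-loading calls mirror their usage elsewhere in
# this module; the exact generate_style_template() signature is an assumption.
def generate_style_template_from_paths(argsdict, ballot_paths: list, style_num, sheet0=0):
    """ Lambda-portable wrapper: receives only pathnames, opens each ballot,
        loads its images, then delegates to generate_style_template().
    """
    ballots = []
    for file_paths in ballot_paths:
        # each entry may be a semicolon-separated path list, as in BIF records.
        paths = re.split(r';', file_paths)
        _, _, ballot_id = analyze_ballot_filepath(paths[0])
        ballot = Ballot(argsdict, file_paths=paths, ballot_id=ballot_id)
        ballot.get_ballot_images()      # reads the images from the PDFs
        ballots.append(ballot)
    return generate_style_template(argsdict, ballots, style_num, sheet0)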
def build_dirname_tasks(argsdict, dirname, subdir=None, ballots_per_chunk=200):
    """ with all bif chunks created, scan them and create tasks in dirname.
        each task contains records from bif for ballots to be included
        in the processing chunk. These are written to extraction_tasklists folder.
        For lambdas processing mode, these tasklists could launch an extraction lambda
    """

    utils.sts(f"Building tasklists to {dirname}/{subdir}...", 3)

    bifpaths = get_biflist(argsdict)     # returns either s3path list or pathlist, depending on argsdict['use_s3_results']
    max_concurrency = argsdict.get('max_lambda_concurrency', 1000)

    tasks_queued = 0
    total_ballots_queued = 0
    
    DB.delete_dirname_files_filtered(dirname=dirname, subdir=subdir)

    for bif_pathname in bifpaths:
        utils.sts(f"  Processing bif {bif_pathname}...", 3)
        BIF.load_bif(bif_pathname=bif_pathname)        # uses s3 based on DB.MODE
        bif_basename = os.path.basename(bif_pathname)
        archive_name = re.sub(r'_bif\.csv$', '', bif_basename)

        reduced_df = BIF.df_without_corrupted()
        
        # the following should be moved to bif generation phase (generally not done)
        reduced_df = set_style_from_party_if_enabled(argsdict, reduced_df)

        # the following reduces the ballots selected based on input
        # parameters and whether the ballots have been successfully mapped.
        filtered_df = filter_extraction_ballots(argsdict, reduced_df)
        
        sorted_df = filtered_df.sort_values(by=['cvr_file'])


        num_ballots_in_bif = len(BIF.df.index)
        num_to_be_extracted = len(sorted_df.index)
        num_excluded = num_ballots_in_bif - num_to_be_extracted

        utils.sts(f"Total of {num_ballots_in_bif} ballots, {num_to_be_extracted} to be extracted, {num_excluded} ballots excluded.", 3)
        if not num_to_be_extracted:
            continue
            
        chunks_lodf = utils.split_df_into_chunks_lodf(df=sorted_df, max_chunk_size=ballots_per_chunk, max_concurrency=max_concurrency)
        num_chunks = len(chunks_lodf)

        utils.sts(f"Split into {num_chunks} chunks, each with no more than {ballots_per_chunk} ballots each.")

        for chunk_index, chunk_df in enumerate(chunks_lodf):
            chunk_name = f"{archive_name}_chunk_{chunk_index:04d}.csv"
            utils.sts(f"Creating {dirname} chunk: {chunk_name}...", 3)
            
            DB.save_data(
                data_item=chunk_df, 
                dirname=dirname,
                subdir=subdir,
                name=chunk_name, 
                )
            tasks_queued += 1
            total_ballots_queued += len(chunk_df.index)


    utils.sts(f"Total of {tasks_queued} {dirname} tasks queued with a total of {total_ballots_queued} ballots.", 3)
def save_dirname_chunk_by_idx(dirname, group_name, chunk_idx: int, chunk_df, s3flag=None):
    file_name = create_dirname_chunk_filename(dirname, group_name, chunk_idx)
    DB.save_data(data_item=chunk_df, dirname=dirname, name=file_name, format='.csv', s3flag=s3flag)
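
# --- Assumed helper sketch: create_dirname_chunk_filename is defined elsewhere
# in this codebase. This version just mirrors the chunk-naming convention used
# in build_dirname_tasks and in the extractvote_by_one_tasklist docstring;
# whether dirname participates in the name at all is an assumption.
def create_dirname_chunk_filename_sketch(dirname: str, group_name: str, chunk_idx: int) -> str:
    # e.g. ('marks', 'myarchive', 7) -> 'myarchive_chunk_0007.csv'
    return f"{group_name}_chunk_{chunk_idx:04d}.csv"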
def genbif_from_cvr(argsdict: dict):
    """
        If CVR files are available with style information, this
        function can be used to generate the BIF data file.

        THIS RUNS VERY FAST NOW; lambdas are not needed if a CVR exists.
    """

    utils.sts('Generating BIFs')

    # if a cvr is provided, use it for information here.
    ballotid_to_style_dict, parsed_dominion_cvr = get_cvr_info(argsdict)

    # check to see if style lookup table is specified.
    style_lookup_table_df = get_style_lookup_table(argsdict)
    
    pstyle_region_str = argsdict.get('pstyle_region')
    pstyle_region_dict = json.loads(pstyle_region_str) if (pstyle_region_str) else None
    pstyle_pattern = argsdict.get('pstyle_pattern', '')
    vendor = argsdict.get('vendor')

    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive_root = os.path.splitext(archive_basename)[0]
        archive = open_archive(argsdict, archive_basename)

        df_dict = {}        # to save time, we will build the dataframe as a dict of dict, then in one swoop create the dataframe.
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        # now scan archives for additional information.

        for index, file_path in enumerate(file_paths):
            style = card_code = ballot_type_id = ''
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)
            _, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

            # initialize defaults in local dict
            bifdict = {c: '' for c in BIF.get_bif_columns()}
            party = bifdict['party'] = get_party(argsdict, file_path)
            precinct = bifdict['precinct'] = get_precinct(argsdict, file_path)
            bifdict['sheet0'] = '0'
            
            #utils.sts(f"Processing {ballot_id} precinct {precinct} party {party}", 3)
            if vendor == 'Dominion':
                if parsed_dominion_cvr:
                    try:
                        ballot_rec = parsed_dominion_cvr[ballot_id]
                    except KeyError:
                        bifdict['comments'] = "Couldn't find ballot id in the CVR dict"
                    else:
                        for field in ['style_num', 'cvr_name', 'card_code', 'ballot_type_id']:
                            bifdict[field] = ballot_rec[field]
                        bifdict['is_bmd'] = '1' if ballot_rec['is_bmd'] else '0'
                        bifdict['sheet0'] = str(ballot_rec['sheet0'])

                else:
                    try:
                        style_num = str(ballotid_to_style_dict[ballot_id])
                    except (KeyError, TypeError):
                        utils.exception_report(f"ballot_id {ballot_id} found in {source} but not in ballotid_to_style_dict. Skipping.")
                        continue
                    bifdict['style_num'] = bifdict['card_code'] = style_num

                # the following creates the CONV_card_code_TO_ballot_type_id_DICT
                card_code = bifdict['card_code']
                ballot_type_id = bifdict['ballot_type_id']      # use the value parsed from the CVR, not the empty local default.

                update_CONV_card_code_TO_ballot_type_id_DICT(card_code, ballot_type_id)

            elif vendor == 'ES&S':

                is_bmd = is_archived_file_BMD_type_ess(argsdict, archive, ballot_file_paths[0])
                bifdict['is_bmd'] = '1' if is_bmd else '0'

                if ballotid_to_style_dict:
                    try:
                        style = str(ballotid_to_style_dict[int(ballot_id)])
                    except KeyError:
                        utils.exception_report(f"ballot_id {ballot_id} found in {source} but not in cvr. Skipping.")
                        continue
                    card_code = style
                    
                elif style_lookup_table_df is not None:
                    # a style lookup table has been specified and loaded;
                    # look up the style based on the party and precinct values from the path.
                    
                    try:
                        lookup_row = style_lookup_table_df.loc[(style_lookup_table_df['party'] == party) & (style_lookup_table_df['precinct'] == int(precinct))]
                    except Exception as err:
                        utils.exception_report(f"style lookup table format problem: {err}")
                        sys.exit(1)
                    if len(lookup_row) > 1:
                        utils.exception_report(f"Duplicate row values in style lookup table: {lookup_row}")
                    elif not len(lookup_row):
                        utils.exception_report(f"No style lookup table entry for party {party}, precinct {precinct}. Skipping.")
                        continue

                    # is_bmd was already determined above for the ES&S branch.
                    bifdict['style_num'] = str(lookup_row['style_num'].values.item())
                    bifdict['archive_basename'] = archive_basename
                    bifdict['ballot_id'] = ballot_id
                    bifdict['file_paths'] = ';'.join(ballot_file_paths)
                    bifdict['card_code'] = str(lookup_row['card_code'].values.item())
                
                else:
                    # we do not have the ballotid_to_style dict; this happens if there is no CVR.
                    # we must determine the style and bmd status by inspection of the ballots.
                    # this can be very time consuming!
                    # NOTE: should use genbif_from_ballots instead.
                   

                    # @@ Should check to see if bif files already exist and appear to have the correct number of records.
                    bifdict = create_bif_dict_by_reading_ballot(argsdict, ballot_id, index, archive_basename, archive, ballot_file_paths,
                                                                pstyle_region_dict, pstyle_pattern)

            df_dict[index] = bifdict

        # create the dataframe all at once.
        df = pd.DataFrame.from_dict(df_dict, "index")
        DB.save_data(data_item=df, dirname='bif', name=f"{archive_root}_bif.csv")
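
# --- Usage sketch for the (party, precinct) style lookup above, including the
# empty-match guard. Column names follow the .loc[] call in genbif_from_cvr;
# the sample values are invented.
def _style_lookup_example():
    import pandas as pd
    style_lookup_table_df = pd.DataFrame({
        'party':     ['DEM', 'REP'],
        'precinct':  [101, 101],
        'style_num': ['12', '13'],
        'card_code': ['12A', '13A'],
    })
    lookup_row = style_lookup_table_df.loc[
        (style_lookup_table_df['party'] == 'DEM') &
        (style_lookup_table_df['precinct'] == 101)]
    if len(lookup_row) == 1:
        return str(lookup_row['style_num'].values.item())   # -> '12'
    return None     # missing or duplicate entry; report and skip the ballot.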
def extractvote_by_one_tasklist(
        argsdict: dict,
        tasklist_name: str,
        ):
    """ ACTIVE
    
    Extract vote from all ballots as specified in tasklist chunk in extraction_tasks folder.

    params:
    :param argsdict: provides arguments from the input file or CLI, such as filter specs.
    :param tasklist_name: created as f"{BIF.name}_chunk_{chunk_index:04d}.csv";
            the tasklist is found in the extraction_tasks folder.

    produces results/marks_{tasklist_name}

    This is the primary extraction function for lambda operation.
    
    PRIOR TO LAUNCHING THIS:
        Check availability of:
            styles/roismap.csv          -- as a result of gentemplates, genrois, genmap
            styles/contests_dod.json    -- based on EIF
            

    """

    current_archive_basename = ''
    archive = None

    # set s3 vs local mode
    DB.set_DB_mode()        

    # initialize results.
    DB.BALLOT_MARKS_DF = pd.DataFrame()
    
    rois_map_df      = DB.load_data('styles', 'roismap.csv')
    contests_dod     = DB.load_data('styles', 'contests_dod.json')

    #extraction_tasks_df = DB.load_df_csv(name=tasklist_name, dirname='extraction_tasks', s3flag=argsdict['use_s3_results'])
    extraction_tasks_df = DB.load_data(dirname='marks', subdir='tasks', name=tasklist_name)

    #archives_folder_path = argsdict['archives_folder_path']

    for task_idx in range(len(extraction_tasks_df.index)):

        task_dict           = extraction_tasks_df.iloc[task_idx]
        ballot_id           = task_dict['ballot_id']
        precinct            = task_dict['precinct']
        archive_basename    = task_dict['archive_basename']

        """ has structure of BIF
            ('archive_basename', str),
            ('ballot_id', str),
            ('file_paths', str),    # note, may be semicolon separated list.
            ('cvr_file', str),
            ('precinct', str),
            ('party', str),
            ('style_num', str),
            ('card_code', str),
            ('ballot_type_id', str),
            ('sheet0', 'Int32'),                 # 0, 1 ...
            ('is_bmd', 'Int32'),
            ('style_roi_corrupted', 'Int32'),
            ('other_comments', str),
        """

        ballot_style_overrides_dict = args.get_ballot_style_overrides(argsdict)

        #ballot_id, vendor='ES&S', precinct=None, party=None, group=None, extension=None, file_paths=[]):
        # this call does nothing more than initialize the instance data
        ballot = Ballot(argsdict, 
            file_paths = re.split(r';', task_dict['file_paths']), 
            ballot_id=ballot_id, 
            precinct=precinct, 
            archive_basename=archive_basename)

        ballot.ballotdict['is_bmd'] = bool(utils.set_default_int(task_dict.get('is_bmd', 0), 0))

        if ((ballot.ballotdict['is_bmd'] and not argsdict['include_bmd_ballot_type']) or
                (not ballot.ballotdict['is_bmd'] and not argsdict['include_nonbmd_ballot_type'])):

            utils.exception_report(f"Tasklist says is_bmd is {ballot.ballotdict['is_bmd']} "
                "but argsdict does not include that type. Extract tasklists may be stale")
            continue

        if archive_basename != current_archive_basename:
            if current_archive_basename and archive:
                archive.close()
            utils.sts (f"opening archive: '{archive_basename}'...", 3)
            archive = open_archive(argsdict, archive_basename)
            current_archive_basename = archive_basename

        if not ballot.load_source_files(archive):
            string = f"EXCEPTION: Could not load source files from archive {archive_basename} offset {task_idx} for ballot_id: {ballot_id} Precinct: {precinct}"
            utils.exception_report(string)
            continue

        utils.sts(f"\n{'-'*50}\nProcessing tasklist:{tasklist_name} offset: {task_idx} ballot_id:{ballot_id}", 3)

        ballot.get_ballot_images()      # this reads images from PDFs

        #-----------------------------------------------------
        # this is the primary function call, performed for each ballot,
        # and producing a marks_df for this ballot, with one record for
        # each option.
        
        ballot_marks_df = extract_vote_from_ballot(
            argsdict, ballot, rois_map_df, contests_dod,
            ballot_style_overrides_dict,
            )
            
        # the above function makes exception reports if:
        #   1. the style cannot be read from the ballot, alignment or barcode error.
        #   2. the style failed to map.
        #-----------------------------------------------------

        if ballot_marks_df is None or not len(ballot_marks_df.index):
            continue    # not successful; an exception has already been logged.

        # pd.concat replaces the deprecated DataFrame.append (same result).
        DB.BALLOT_MARKS_DF = pd.concat([DB.BALLOT_MARKS_DF, ballot_marks_df], sort=False, ignore_index=True)

    #DB.save_df_csv(name=tasklist_name, dirname='marks', df=DB.BALLOT_MARKS_DF)
    DB.save_data(data_item=DB.BALLOT_MARKS_DF, dirname='marks', subdir='chunks', name=f"marks_{tasklist_name}")
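
# --- Usage sketch (assumes argsdict has already been built by the input-file/CLI
# parser used elsewhere in this codebase; 'archive_name' is caller-supplied and
# the tasklist name format comes from the extractvote_by_one_tasklist docstring).
def run_one_extraction_chunk(argsdict: dict, archive_name: str, chunk_index: int):
    tasklist_name = f"{archive_name}_chunk_{chunk_index:04d}.csv"
    extractvote_by_one_tasklist(argsdict, tasklist_name)
    # the accumulated marks dataframe is saved to marks/chunks/marks_{tasklist_name}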