def delegated_build_bif_chunk(dirname, task_args, s3flag=None):
    """ this function is suitable for execution in lambda after delegation
        can also use by local machine even if s3 is used for output.
    """

    # task_args keys: argsdict, chunk_idx, filelist, subdir, chunk_name, group_name
    args.argsdict = argsdict = task_args['argsdict']
    
    chunk_idx   = task_args['chunk_idx']
    filelist    = task_args['filelist']                         # the list of files to be processed in this chunk.
    subdir      = task_args['subdir']
    chunk_name  = task_args['chunk_name']
    
    archive_basename = task_args['group_name']
    archive = open_archive(argsdict, archive_basename)          # if using s3, this will open the archive on s3.
    full_file_list = get_file_paths(archive)
    if not full_file_list:
        raise LookupError(f"archive {archive_basename} appears empty")

    pstyle_region_dict = argsdict.get('pstyle_region')
    pstyle_pattern = argsdict.get('pstyle_pattern', '')

    df_dict = {}        # to save time, we build the dataframe as a dict of dicts, then create the dataframe in one swoop.
                        # format is {0: {column_name: value, ...}, 1: {...}, ...}
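                        # For example (hypothetical column values), building
                        #   df_dict = {0: {'ballot_id': 'b0001', 'style_num': '12'},
                        #              1: {'ballot_id': 'b0002', 'style_num': '12'}}
                        # and then calling pd.DataFrame.from_dict(df_dict, orient='index')
                        # yields a two-row dataframe, avoiding a costly per-row append.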
    
    for index, file_paths in enumerate(filelist):
    
        ballot_file_paths = re.split(r';', file_paths)
        _, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

        df_dict[index] = create_bif_dict_by_reading_ballot(argsdict, 
                                                            ballot_id, 
                                                            index, 
                                                            archive_basename, 
                                                            archive, 
                                                            ballot_file_paths,
                                                            pstyle_region_dict, 
                                                            pstyle_pattern,
                                                            chunk_idx)
    # create the dataframe all at once.
    chunk_df = pd.DataFrame.from_dict(df_dict, orient="index")

    DB.save_data(data_item=chunk_df, dirname=dirname, subdir=subdir, name=chunk_name, format='.csv', s3flag=s3flag)
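
# A minimal usage sketch (hypothetical values; the real task_args dict is assembled
# by the delegation layer, e.g. build_one_chunk, before dispatch to a lambda):
#   task_args = {'argsdict': argsdict,
#                'chunk_idx': 0,
#                'filelist': ['scans/0001_p1.pdf;scans/0001_p2.pdf'],
#                'subdir': 'chunks',
#                'chunk_name': 'myarchive_bif_chunk_0000.csv',
#                'group_name': 'myarchive.zip'}
#   delegated_build_bif_chunk(dirname='bif', task_args=task_args, s3flag=True)
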
def save_failing_ballots(argsdict):
    """ given list of ballots in inputfile, copy the original ballot image files
        to (job_folder_path)/styles/(ballot_id) folders
        
        this function
            1. builds single bif table.
            2. looks each ballot up.
            3. using entry, opens the indicated archive and extracts the original file.
            4. saves the file in folder of jobname and ballot_id in styles, see above.
    """
    
    full_bif_df = combine_archive_bifs()
    
    ballot_list = argsdict['ballotid']
    
    opened_archive_basename = ''
    archive = None
    
    for ballot_id in ballot_list:
        utils.sts(f"processing ballot_id:{ballot_id}", 3)
        rows = full_bif_df.loc[full_bif_df['ballot_id'] == ballot_id]       # select set of rows with value in column_name equal to some_value.
        
        archive_basename = rows['archive_basename'].values.item()     # return one item from a row
        file_paths_str = rows['file_paths'].values.item()
        file_paths = file_paths_str.split(';')
        
        dest_dirpath = DB.dirpath_from_dirname('styles')
        
        if archive_basename != opened_archive_basename:
            if opened_archive_basename:
                archive.close()
            archive = open_archive(argsdict, archive_basename)
            opened_archive_basename = archive_basename
            
        for file_path in file_paths:
            basename = os.path.basename(file_path)
            dest_filepath = os.path.join(dest_dirpath, ballot_id, basename)
            extract_file(archive, file_path, dest_filepath)
            utils.sts(f"...extracted:{file_path} to {dest_filepath}", 3)
        
    if opened_archive_basename:
        archive.close()
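
# A minimal usage sketch (hypothetical ballot ids; argsdict['ballotid'] supplies
# the list, as read above):
#   argsdict['ballotid'] = ['100234', '100987']
#   save_failing_ballots(argsdict)
# Each ballot's source images land in styles/(ballot_id)/ under the job folder.
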
def generate_template_for_style_by_tasklist_lod(argsdict: dict,
                                                tasklist_lod: list = None):
    """ ACTIVE
        This function is driven by a preselected set of ballots listed in BIF format.
        This list is prefiltered to exclude BMD ballots, and the ballots are all of the
        same physical style so they can be combined to produce a template with higher
        resolution and which largely excludes random marks. Generates a set of template images:
        1. opens the files either from local zip archives or from an s3 bucket (already unzipped).
        2. aligns the images to alignment targets.
        3. reads the barcode style and checks it with the card_code (which may differ from the style_num)
        4. gets the timing marks.
        5. calls generate_style_template(), which:
            a. reviews the images and chooses the most average image in terms of stretch.
            b. discards any excessively stretched images.
            c. stretch-fixes the rest on timing-mark basis to "standard" timing marks.
            d. combines into one image.
            e. saves style information as JSON.
    """
    global archive
    global current_archive_basename
    current_archive_basename = ''

    ballot_queue = []
    ballots_unprocessed = []
    tot_failures = 0

    #if not tasklist_lod:
    #    tasklist_lod = tasklist_df.to_dict(orient='records')
    for task_idx, item_dict in enumerate(tasklist_lod):

        archive_basename = item_dict['archive_basename']
        ballot_file_paths = re.split(r';', item_dict['file_paths'])
        precinct = item_dict['precinct']
        sheet0 = item_dict['sheet0']
        card_code = item_dict['card_code']
        style_num = item_dict['style_num']      # will be the same for all records.

        ballot      = Ballot(argsdict, file_paths=ballot_file_paths, archive_basename=archive_basename)    # initialize and derive ballot_id, precinct, party, group, vendor
        ballot_id   = ballot.ballotdict['ballot_id']
        precinct    = ballot.ballotdict['precinct']

        utils.sts (f"gentemplate_by_tasklist for "
                    f"style_num:{style_num} "
                    f"item:{task_idx} "
                    f"in archive {archive_basename} "
                    f"ballotid:{ballot_id} "
                    f"in precinct:'{precinct}'...", 3)

        if archive_basename != current_archive_basename:
            if current_archive_basename:
                archive.close()
            utils.sts (f"opening archive: '{archive_basename}'...", 3)
            archive = open_archive(argsdict, archive_basename)
            current_archive_basename = archive_basename

        if not ballot.load_source_files(archive):
            utils.exception_report(f"EXCEPTION: Could not load source files from archive {archive_basename} "
                                    f"item:{task_idx} for ballot_id: {ballot_id} Precinct: {precinct}")
            continue
        ballot.get_ballot_images()
        ballot.align_images()
        read_style_num = ballot.read_style_num_from_barcode(argsdict)
        if not argsdict.get('style_from_party', None) and not argsdict.get('style_lookup_table_path', ''):
            if str(read_style_num) != str(card_code):
                utils.exception_report(f"Style {read_style_num} in ballot {ballot_id} doesn't match style card_code {card_code} from tasklist")
                #add_instruction(bif_name=source_name, ballot_id=ballot_id, column='style_num', value=f'not matched to {style_num}')
                ballots_unprocessed.append(ballot_id)
                continue
        #add_instruction(bif_name=archive_basename, ballot_id=ballot_id, column='style_num', value=style_num)

        ballot.get_timing_marks()       # for each image, capture the timing marks to ballot instance.
                                        # note that sometimes timing marks are not available on page 1.

        if not are_timing_marks_consistent(ballot.ballotdict['timing_marks']):
            utils.exception_report(f"EXCEPTION: Timing mark recognition failed: ballot_id: {ballot_id} Precinct: {precinct}")
            tot_failures += 1
            continue
        ballot_queue.append(ballot)

    utils.sts(f"Generating Style Template from {len(ballot_queue)} ballots (omitted {tot_failures} failed ballots)...", 3)
    if generate_style_template(argsdict, ballot_queue, style_num, sheet0):
        utils.sts(f"Style templates generation completed successfully.\n Processed a total of {len(ballot_queue)} ballots", 3)
        return True
    else:
        utils.sts("Style templates generation FAILED.", 3)
        return False
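
# A minimal sketch of one tasklist_lod record (field names taken from the reads
# inside the function; values are hypothetical):
#   tasklist_lod = [{'archive_basename': 'myarchive.zip',
#                    'file_paths': 'scans/0001_p1.pdf;scans/0001_p2.pdf',
#                    'precinct': 'P-17',
#                    'sheet0': 0,
#                    'card_code': '1012',
#                    'style_num': '1012'}]
#   generate_template_for_style_by_tasklist_lod(argsdict, tasklist_lod)
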
def get_styles_to_contests_dominion(argsdict, ballot_type_contest_manifest='BallotTypeContestManifest.json',
                                    contest_manifest='ContestManifest.json', just_ids=False, silent_error=False):
    """
    
    Builds a styles to contests dict, where styles are ballot type_id.
    It requires BIF files to work and "BallotTypeContestManifest.json",
    "ContestManifest.json" like files in the Dominion CVR ZIP.
    :just_ids: Set to True returns "styles_to_contests_dict.json" with
    "ballot_type_id > contest_ids" instead of "ballot_type_id > contest_names".
    
    Assumes the various manifest files are in a single cvr zip file.
    
    """
    contest_id_to_names = {}
    ballot_type_to_contests = {}
    cvr_file = argsdict.get('cvr')[0]
    utils.sts(f'Loading CVR {cvr_file}')
    cvr_archive = open_archive(argsdict, cvr_file, testzip=False, silent_error=silent_error)

    # First open contests manifest to build dict
    # contest id > contest name.
    try:
        with cvr_archive.open(contest_manifest) as manifest_file:
            utils.sts(f'Opened {contest_manifest}')
            data = json.loads(manifest_file.read()).get('List')
    except (FileNotFoundError, ValueError) as error:
        if not silent_error:
            logs.exception_report(f"Could not load {contest_manifest} from CVR archive {cvr_file} due to %s", error)
            sys.exit(1)
        else:
            return None
    
    utils.sts(f'Loaded manifest data, {len(data)} rows found')
    for row in data:
        contest_id = str(row.get('Id'))
        contest_name = row.get('Description')
        contest_id_to_names[contest_id] = contest_name
    utils.sts(f'Contest ids to names dict built, {len(contest_id_to_names)} rows found')
    del data

    # Then open the ballot type contest manifest to build dict
    # ballot type id > list of contest names/ids.
    try:
        with cvr_archive.open(ballot_type_contest_manifest) as manifest_file:
            utils.sts(f'Opened {ballot_type_contest_manifest}')
            data = json.loads(manifest_file.read()).get('List')
    except (FileNotFoundError, ValueError) as error:
        if not silent_error:
            logs.exception_report(f"Could not load {manifest_file} from CVR archive {cvr_file} due to %s", error)
            sys.exit(1)
        else:
            return None
            
    utils.sts(f'Loaded manifest data, {len(data)} rows found')
    for row in data:
        type_id = row.get('BallotTypeId')
        contest_id = str(row.get('ContestId'))
        contest_name = contest_id_to_names.get(contest_id) if not just_ids else contest_id
        ballot_type_to_contests.setdefault(type_id, []).append(contest_name)
    utils.sts(f'Ballot type ids to contests dict built, {len(ballot_type_to_contests)} rows found')
    del data

    #DB.save_json('styles', f"{config_dict['CVR_STYLE_TO_CONTESTS_DICT_FILENAME']}.json", ballot_type_to_contests)
    DB.save_data(ballot_type_to_contests, 'styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')

    
    # at this point, the styles_to_contests dict of list is created, where the key is the ballot_type_id
    # for each style in this list, split it between pages.    

    if False:   # this needs to be updated; not currently used. Was: if argsdict['merge_similar_styles']:
    
        # NOTE: It is invalid to merge styles at this point, only based on the contests in them, due to language differences.
        
        # for a given sheet, we may be able to merge styles while still respecting language differences. 
        # given ballot_type_id, look up contests on ballot.
        # using EIF, split contest list into separate list for each sheet.
        # for this sheet, compare list of contests with sheet_based_style_list
       
        contests_dod = create_contests_dod(argsdict)    # this reads the EIF
        
        sheetstyle_dol = {}

        for type_id, contest_list in ballot_type_to_contests.items():
            
            grouped_dol = utils.group_list_by_dod_attrib(contest_list, contests_dod, 'sheet0')    
                # Access EIF to get the sheet information for each contest.
                # this produces a dict with groups names for each sheet value
                
                # input might be: contest_list = ['contest1', 'contest2', 'contest3', ...]
                #                 contests_dod = {'contest1': {'sheet0': 0}, 'contest2': {'sheet0': 0}, 'contest3': {'sheet0': 1}, 'contest4': {'sheet0': 1}, ...}
                # output: grouped_dol = {0: ['contest1', 'contest2'], 1: ['contest3', 'contest4']}
                
            for sheet0, contest_list in grouped_dol.items():
                if not contest_list: continue
                
                sheetstyle_num = "%1.1u%3.3u" % (sheet0 + 1, type)
                sheetstyle_dol[sheetstyle_num] = contest_list
                
        # now each ballot_type_id, which includes the contests for all sheets, has been split 
        # into separate styles for each sheet, and with only those contests for that sheet included.
        
        reduced_sheetstyle_dict, sheetstyle_map_dict = utils.reduce_dict(sheetstyle_dol)
        
        # the reduced_sheetstyle_dict includes a minimal subset of those sheetstyles that are unique.
        # the sheetstyle_map_dict provides a way to find the same list using the redundant key.
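        # For example (a hypothetical sketch, assuming reduce_dict dedupes by value):
        #   sheetstyle_dol          = {'1001': ['c1', 'c2'], '1002': ['c1', 'c2'], '1003': ['c3']}
        #   reduced_sheetstyle_dict = {'1001': ['c1', 'c2'], '1003': ['c3']}
        #   sheetstyle_map_dict     = {'1001': '1001', '1002': '1001', '1003': '1003'}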
        
        DB.save_data(reduced_sheetstyle_dict, 'styles', name='reduced_sheetstyle_dict.json')
        DB.save_data(sheetstyle_map_dict, 'styles', name='sheetstyle_map_dict.json')


def genbif_from_ballots(argsdict: dict):
    """
    This function is used when no cvr exists and we need to scan all the
    ballots to create bifs. This is a slow process, so we create
    tasklists for lambda processing.
    """

    if argsdict['use_s3_results']:
        DB.delete_dirname_files_filtered(dirname='bif', s3flag=True, file_pat=None)
        DB.delete_dirname_files_filtered(dirname='bif', subdir='chunks', s3flag=True, file_pat=None)

    # Clear the lambda tracker cache
    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    max_chunk_size = argsdict.get('genbif_ballots_per_chunk', 200)
    max_concurrency = argsdict.get('max_lambda_concurrency', 1000)
    chunk_limit = argsdict.get('genbif_chunk_limit', None)
    num_archives = len(argsdict['source'])
    max_concurrency = max_concurrency // num_archives

    utils.sts('Generating tasklists to scan ballots to create bifs')
    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive = open_archive(argsdict, archive_basename) # will open on s3 directly if using s3
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        filelist = []
        for index, file_path in enumerate(file_paths):
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)
            #_, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

            filelist.append( ';'.join(ballot_file_paths) )
        utils.sts(f"Total of {len(filelist)} ballots in the archive")
        archive.close()

        chunks_lol = utils.split_list_into_chunks_lol(item_list=filelist, max_chunk_size=max_chunk_size, max_concurrency=max_concurrency)
        num_chunks = len(chunks_lol)
        utils.sts(f"Split into {num_chunks} chunks with maximum of {max_chunk_size} ballots each.")
        
        # The loop below may delegate processing to lambdas.
        # Should perform consistency checks here (or before this point) to avoid any costly errors, such as:
        #   1. output bucket specified exists and is writeable.
        # It would be best to make these checks as settings file is initially processed.
        
        
        for chunk_idx, filelist in enumerate(chunks_lol):
            if chunk_limit and chunk_idx >= chunk_limit:
                break
            utils.sts(f"Processing chunk #{chunk_idx} with {len(filelist)} ballots", 3)
            
            build_one_chunk(
                argsdict=argsdict,
                dirname='bif',
                subdir='chunks',
                chunk_idx=chunk_idx, 
                filelist=filelist, 
                group_name=archive_basename, 
                task_name='bif',
                incremental = argsdict['incremental_genbif']
                )   # this may delegate to one lambda
            if argsdict['use_lambdas'] and not archive_idx and not chunk_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='bif'):
                    utils.exception_report("task 'bif' failed delegation to lambdas.")
                    sys.exit(1)           


    wait_for_lambdas(argsdict, task_name='bif')      # @@ wait_for_lambdas should be enhanced to track specific tasks or better use SQS messaging.
    
    for archive_idx, source in enumerate(argsdict['source']):
        archive_rootname = os.path.splitext(os.path.basename(source))[0]

        dirname = 'bif'

        DB.combine_dirname_chunks(
            dirname=dirname, subdir='chunks', 
            dest_name=f"{archive_rootname}_{dirname}.csv", 
            file_pat=fr"{archive_rootname}_{dirname}_chunk_\d+\.csv")
            
        logs.get_and_merge_s3_logs(dirname='bif', rootname='log', chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='bif', rootname='exc', chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')


def genbif_from_cvr(argsdict: dict):
    """
        If CVR files are available with style information, this
        function can be used to generate the BIF data file.
        
        THIS RUNS VERY FAST NOW; no need for lambdas if a CVR exists.
    """

    utils.sts('Generating BIFs')

    # if a cvr is provided, use it for information here.
    ballotid_to_style_dict, parsed_dominion_cvr = get_cvr_info(argsdict)

    # check to see if style lookup table is specified.
    style_lookup_table_df = get_style_lookup_table(argsdict)
    
    pstyle_region_str = argsdict.get('pstyle_region')
    pstyle_region_dict = json.loads(pstyle_region_str) if (pstyle_region_str) else None
    pstyle_pattern = argsdict.get('pstyle_pattern', '')
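    # pstyle_region, if provided, is a JSON string describing the image region where
    # the printed style number appears, and pstyle_pattern is a regex for extracting it.
    # A hypothetical sketch (the exact keys are defined by the settings file):
    #   pstyle_region = '{"x": 0.10, "y": 0.05, "w": 0.30, "h": 0.04}'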
    vendor = argsdict.get('vendor')

    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive_root = os.path.splitext(archive_basename)[0]
        archive = open_archive(argsdict, archive_basename)

        df_dict = {}        # to save time, we build the dataframe as a dict of dicts, then create the dataframe in one swoop.
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        # now scan archives for additional information.

        for index, file_path in enumerate(file_paths):
            style = card_code = ballot_type_id = ''
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)
            _, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

            # initialize defaults in the local dict, including the identity fields
            # common to all vendor branches (otherwise only the style-lookup branch
            # would record which ballot and archive the row describes).
            bifdict = {c: '' for c in BIF.get_bif_columns()}
            bifdict['archive_basename'] = archive_basename
            bifdict['ballot_id'] = ballot_id
            bifdict['file_paths'] = ';'.join(ballot_file_paths)
            party = bifdict['party'] = get_party(argsdict, file_path)
            precinct = bifdict['precinct'] = get_precinct(argsdict, file_path)
            bifdict['sheet0'] = '0'
            
            #utils.sts(f"Processing {ballot_id} precinct {precinct} party {party}", 3)
            if vendor == 'Dominion':
                if parsed_dominion_cvr:
                    try:
                        ballot_rec = parsed_dominion_cvr[ballot_id]
                    except KeyError:
                        bifdict['comments'] = "Couldn't find ballot id in the CVR dict"
                    else:
                        for field in ['style_num', 'cvr_name', 'card_code', 'ballot_type_id']:
                            bifdict[field] = ballot_rec[field]
                        bifdict['is_bmd'] = '1' if ballot_rec['is_bmd'] else '0'
                        bifdict['sheet0'] = str(ballot_rec['sheet0'])

                else:
                    try:
                        style_num = str(ballotid_to_style_dict[ballot_id])
                    except (KeyError, TypeError):
                        utils.exception_report(f"ballot_id {ballot_id} found in {source} but not in ballotid_to_style_dict. Skipping.")
                        continue
                    bifdict['style_num'] = bifdict['card_code'] = style_num

                # the following creates the CONV_card_code_TO_ballot_type_id_DICT
                card_code = bifdict['card_code']
                ballot_type_id = bifdict['ballot_type_id']      # '' unless the parsed CVR provided it
                
                update_CONV_card_code_TO_ballot_type_id_DICT(card_code, ballot_type_id)

            elif vendor == 'ES&S':

                is_bmd = is_archived_file_BMD_type_ess(argsdict, archive, ballot_file_paths[0])
                bifdict['is_bmd'] = '1' if is_bmd else '0'

                if ballotid_to_style_dict:
                    try:
                        style = str(ballotid_to_style_dict[int(ballot_id)])
                    except KeyError:
                        utils.exception_report(f"ballot_id {ballot_id} found in {source} but not in cvr. Skipping.")
                        continue
                    # record the style in the row, mirroring the Dominion branch above.
                    bifdict['style_num'] = bifdict['card_code'] = card_code = style
                    
                elif style_lookup_table_df is not None:
                    # a style lookup table has been specified and loaded.
                    # look up the style based on the party and precinct values from the path.
                    # to select a row on multiple conditions, combine boolean masks with &:
                    
                    try:
                        lookup_row = style_lookup_table_df.loc[(style_lookup_table_df['party'] == party) & (style_lookup_table_df['precinct'] == int(precinct))]
                    except Exception as err:
                        utils.exception_report(f"style lookup table format problem: {err}")
                        sys.exit(1)
                    if len(lookup_row) != 1:
                        # .values.item() below requires exactly one matching row.
                        utils.exception_report(f"Expected exactly one row in style lookup table, got {len(lookup_row)}: {lookup_row}")
                        continue
                    
                    # is_bmd and the identity fields were already set above.
                    bifdict['style_num'] = str(lookup_row['style_num'].values.item())
                    bifdict['card_code'] = str(lookup_row['card_code'].values.item())
                
                else:
                    # if we do not have the ballotid_to_style dict (this happens when there is no CVR),
                    # we must determine the style and bmd status by inspecting the ballots.
                    # this can be very time consuming!
                    # NOTE: should use genbif_from_ballots instead.

                    # @@ Should check to see if bif files already exist and appear to have the correct number of records.
                    bifdict = create_bif_dict_by_reading_ballot(argsdict, ballot_id, index, archive_basename, archive, ballot_file_paths,
                                                                pstyle_region_dict, pstyle_pattern)

            df_dict[index] = bifdict

        # create the dataframe all at once.
        df = pd.DataFrame.from_dict(df_dict, "index")
        DB.save_data(data_item=df, dirname='bif', name=f"{archive_root}_bif.csv")
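
# A minimal usage sketch (assumes argsdict carries the 'cvr', 'source', and 'vendor'
# entries read above):
#   genbif_from_cvr(argsdict)
# One {archive_root}_bif.csv is written to the bif folder per source archive.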


def extractvote_by_one_tasklist(
        argsdict: dict,
        tasklist_name: str,
        ):
    """ ACTIVE
    
    Extract vote from all ballots as specified in tasklist chunk in extraction_tasks folder.

    params:
    :param argsdict: provides arguments from input file or CLI such as filter specs.
    :param tasklist_name: created by f"{BIF.name}_chunk_{'%4.4u' % (chunk_index)}.csv"
            tasklist is found in the extraction_tasks folder.

    produces results/marks_{tasklist_name}

    This is the primary extraction function for lambda operation.
    
    PRIOR TO LAUNCHING THIS:
        Check availability of:
            styles/rois_map_df.csv      -- as a result of gentemplates, genrois, genmap
            styles/contests_dod.json    -- based on EIF
            

    """

    current_archive_basename = ''
    archive = None

    # set s3 vs local mode
    DB.set_DB_mode()        

    # initialize results.
    DB.BALLOT_MARKS_DF = pd.DataFrame()
    
    rois_map_df      = DB.load_data('styles', 'roismap.csv')
    contests_dod     = DB.load_data('styles', 'contests_dod.json')

    extraction_tasks_df = DB.load_data(dirname='marks', subdir='tasks', name=tasklist_name)


    for task_idx in range(len(extraction_tasks_df.index)):

        task_dict           = extraction_tasks_df.iloc[task_idx]
        ballot_id           = task_dict['ballot_id']
        precinct            = task_dict['precinct']
        archive_basename    = task_dict['archive_basename']

        """ has structure of BIF
            ('archive_basename', str),
            ('ballot_id', str),
            ('file_paths', str),    # note, may be semicolon separated list.
            ('cvr_file', str),
            ('precinct', str),
            ('party', str),
            ('style_num', str),
            ('card_code', str),
            ('ballot_type_id', str),
            ('sheet0', 'Int32'),                 # 0, 1 ...
            ('is_bmd', 'Int32'),
            ('style_roi_corrupted', 'Int32'),
            ('other_comments', str),
        """

        ballot_style_overrides_dict = args.get_ballot_style_overrides(argsdict)

        # this call does nothing more than initialize the instance data
        ballot = Ballot(argsdict, 
            file_paths = re.split(r';', task_dict['file_paths']), 
            ballot_id=ballot_id, 
            precinct=precinct, 
            archive_basename=archive_basename)

        ballot.ballotdict['is_bmd'] = bool(utils.set_default_int(task_dict.get('is_bmd', 0), 0))

        if (ballot.ballotdict['is_bmd'] and not argsdict['include_bmd_ballot_type'] or
            not ballot.ballotdict['is_bmd'] and not argsdict['include_nonbmd_ballot_type']):

            utils.exception_report(f"Tasklist says is_bmd is {ballot.ballotdict['is_bmd']} "
                "but argsdict does not include that type. Extract tasklists may be stale")
            continue

        if archive_basename != current_archive_basename:
            if current_archive_basename and archive:
                archive.close()
            utils.sts (f"opening archive: '{archive_basename}'...", 3)
            archive = open_archive(argsdict, archive_basename)
            current_archive_basename = archive_basename

        if not ballot.load_source_files(archive):
            string = f"EXCEPTION: Could not load source files from archive {archive_basename} offset {task_idx} for ballot_id: {ballot_id} Precinct: {precinct}"
            utils.exception_report(string)
            continue

        utils.sts(f"\n{'-'*50}\nProcessing tasklist:{tasklist_name} offset: {task_idx} ballot_id:{ballot_id}", 3)

        ballot.get_ballot_images()      # this reads images from PDFs

        #-----------------------------------------------------
        # this is the primary function call, performed for each ballot,
        # and producing a marks_df for this ballot, with one record for
        # each option.
        
        ballot_marks_df = extract_vote_from_ballot(
            argsdict, ballot, rois_map_df, contests_dod,
            ballot_style_overrides_dict,
            )
            
        # the above function makes exception reports if:
        #   1. the style cannot be read from the ballot, alignment or barcode error.
        #   2. the style failed to map.
        #-----------------------------------------------------

        if ballot_marks_df is None or not len(ballot_marks_df.index):
            continue    # not successful and exception has already been logged.

        DB.BALLOT_MARKS_DF = DB.BALLOT_MARKS_DF.append(ballot_marks_df, sort=False, ignore_index=True)

    DB.save_data(data_item=DB.BALLOT_MARKS_DF, dirname='marks', subdir='chunks', name=f"marks_{tasklist_name}")
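
# A minimal usage sketch (hypothetical tasklist name, following the docstring's
# f"{BIF.name}_chunk_%4.4u.csv" naming scheme):
#   extractvote_by_one_tasklist(argsdict, 'myarchive_bif_chunk_0003.csv')
# Results are saved as marks/chunks/marks_myarchive_bif_chunk_0003.csv.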