def get_and_merge_s3_logs(dirname, rootname='log', chunk_pat=None, subdir=None):
    """Fetch all per-chunk lambda log files for a job from S3 and merge them.

    Downloads every file in `dirname` (optionally within `subdir`) whose name
    matches f"{rootname}_{chunk_pat}.txt" into a scratch 'tmp' folder,
    concatenates them into a single file named f"{rootname}_{dirname}.txt"
    in the local job folder, and uploads that combined file back to S3.

    :param dirname: job folder on S3 holding the per-chunk log files.
    :param rootname: leading component of each chunk-log file name.
    :param chunk_pat: regex fragment matching the chunk portion of file names
        (chunk files are named f"log_{group_root}_{dirname}_chunk_{chunk_idx}.txt").
    :param subdir: optional subdirectory within dirname.
    :return: number of chunk files merged.
    """
    utils.sts(f"Getting the {rootname} files from s3 and combining")

    # Start from an empty scratch area so stale files are never merged in.
    scratch_dirpath = DB.dirpath_from_dirname('tmp')
    shutil.rmtree(scratch_dirpath, ignore_errors=True)

    sts(f"Downloading all {rootname} files, one per chunk", 3)
    # Download only the files whose names match the chunk pattern.
    DB.download_entire_dirname(dirname=dirname, subdir=subdir, file_pat=fr"{rootname}_{chunk_pat}\.txt", local_dirname='tmp')

    sts(f"Combining {rootname} files", 3)
    merged_name = f"{rootname}_{dirname}.txt"
    local_job_dirpath = DB.dirpath_from_dirname(dirname=dirname, s3flag=False)
    combined_log_filepath = local_job_dirpath + merged_name

    num_files = merge_txt_dirname(dirname='tmp', subdir=subdir, destpath=combined_log_filepath, file_pat=f"{rootname}_*.txt")

    sts(f"Writing combined {rootname} file: {combined_log_filepath} to s3 in dirname:'{dirname}'", 3)
    if os.path.exists(combined_log_filepath):
        # NOTE(review): the merged file was written under the job dirpath, yet the
        # upload references local_dirname='tmp' — presumably DB resolves this; confirm.
        DB.upload_file_dirname(dirname, merged_name, local_dirname='tmp')
    return num_files
# NOTE(review): removed scraped-page artifact ("Пример #2" / "0") that was not
# Python and broke the module syntax.
def main():
    """Top-level dispatcher for the audit engine.

    Parses the job settings file into argsdict (via args.get_args), optionally
    runs a self test, then dispatches on argsdict['op'] to one stage of the
    audit pipeline (BIF generation, template generation, ROI mapping, vote
    extraction, CVR comparison, reporting) or a development/maintenance
    utility.  Calls sys.exit() when the op value is unrecognized.
    """
    utils.show_logo()
    print(  f"\n\n{'=' * 50}")

    argsdict = args.get_args()          # parses input_file as specified in CLI using arg_specs.csv
    args.argsdict = argsdict
    
    print("argsdict:")
    print(pprint.pformat(argsdict))

    print(  f"\n\n{'=' * 50}")

    if (argsdict.get('self_test')):
        self_test.self_test(argsdict)


    """ The paths of archives is normalized to allow the archives to be either local or on s3.
        'archives_folder_path' -- path to folder on local system.
        'archives_folder_s3path' -- s3path to folder on s3
        'source' list are basenames, without path, but including extension.
        
    """


    # if argsdict['archives_folder_path'] and not argsdict['source']:
        # # create a list of source archives in the source folder.
        # srcdict = {}
        # dirdict = utils.get_dirdict(argsdict['archives_folder_path'], '.zip')
        # for name, path in dirdict.items():

            # if (name in argsdict['exclude_archives'] or
                # argsdict['include_archives'] and not name in argsdict['include_archives']):
                # continue
            # srcdict[name] = path

        # argsdict['source'] = list(srcdict.values())
        # argsdict['srcdict'] = srcdict
        # utils.sts(f"input directive 'source' resolved to: {argsdict['source']}", 3)

    # operation selector; defaults to 'all' and is compared lowercased below.
    op = argsdict.get('op', 'all').lower()
    
    # presumably selects local vs s3 storage backend -- confirm in DB module.
    DB.set_DB_mode()
    
    """ =======================================================================
        PRIMARY API ENTRY POINTS
        
        Each one of the following relies on a job file which provides the settings
        as parameter,value in csv file, where comments are allowed preceded by #.
        Thus the api must provide 
            -i path             location of settings file -- could be file on s3.
            -op operation       string like 'genbif_from_cvr'
            
        Each function produces:
            log.txt                 appends extensive status reports.
            exception_report.txt    appends each exception encountered. 
                                        exceptions to processing and not python exceptions, per se.
                                        
            as well as other files, noted below.
            
        Initial implementation will include one major intry point with operation selection as follows:
            'genbif_from_cvr'           (Fast)
            'genbif_from_ballots'       (Slow)
            'create_bif_report'         (Fast)
            'gentemplates'              (Slow)
            'genmaprois'                (Somewhat slow)
            'extractvote'               (Very slow)
            'genreport'                 (fast)
            'cmpcvr_and_report'         (somewhat slow)
            'get_status'                (fast) - return status of slow functions.    
                op='get_status' ref='function'
                    where function = one of 'genbif_from_ballots', 'gentemplates', 'genmaprois', 'extractvote'
            
        In the functions below, argsdict is established from the settings file.
        
    """

    if op == 'copy_config_files_to_s3':
        """ This function will copy local config files in EIFs to s3, to simulate
            interaction with the frontend website, which will upload and place files
            s3://us-east-1-audit-engine-jobs/{job_name}/config/ 
            
            Files to be placed there:
                JOB settings file
                EIF file
                BOF file
                manual_styles_to_contests
                style_lookup_table
                
            In local mode running these are in either EIFs/ or input_files/ in repo folder.
                
        """
        DB.upload_file_dirname('config', argsdict['eif'])
        DB.upload_file_dirname('config', argsdict['bof'])
        DB.upload_file_dirname('config', argsdict['manual_styles_to_contests_filename'])
        DB.upload_file_dirname('config', argsdict['style_lookup_table_filename'])
        DB.upload_file_dirname('config', argsdict['input'], local_dirname='input_files')
            
        
        
        
    elif op == 'precheck_job_files':
        """ This function simply does a precheck of the job files that exist
            in the config folder for this job on s3.
        """
        pass
    
    
    
    
    
    
    elif op == 'genbif_from_cvr':
        """ 
        If CVR file(s) are provided with style information included, 
        this operation builds "ballot information file" BIF data by reviewing the CVR
        May also use path information of ballots in archives for precincts, groups, party.
        For Dominion, scan CVR JSON chunks and fill in info about ballots.
        Creates one .csv file for each archive in folder bif.
        This is a relatively fast operation that can be completed typically in a matter of seconds
        Result:
            BIF data file ready for BIF report.
            log
            exception report
        """
        genbif_from_cvr(argsdict)


    elif op == 'genbif_from_ballots':
        """ 
        If no CVR is available, we must scan the ballots to generate the bif.
        Each ballot is reviewed and style information is read from the ballots.
        May also use path information of ballots in archives for precincts, groups, party.
        This can be done by lambdas and should complete within minutes but
        typically will not complete during a single REST post/response.
        Result:
            BIF ready to produce BIF report.
            separate folder for each failing ballot to allow investigation.
            log
            exception report
        """
        genbif_from_ballots(argsdict)
        
    # elif op == 'get_status':
        # """ This function provides status operation in terms of % complete.
        # """
        # if ref == 'genbif_from_ballots':
            # return get_status_genbif_from_ballots(argsdict)
        # elif ref == 'gentemplates':
            # return get_status_gentemplates(argsdict)
        # elif ref == 'genmaprois':
            # return get_status_genmaprois(argsdict)
        # elif ref == 'extractvote':
            # return get_status_extractvote(argsdict)
        # else:
            # utils.sts(f"ref '{ref}' not supported by op=get_status", 3)

    elif op == 'create_bif_report':
        """ 
        as a result of validate_bifs or genbif_from_ballots, this report is 
        generated, or it can be generated once the BIF is built. Report provides:
            Number of Ballot Archives
            Total number of BIF records
            Unique ballot_ids
            Duplicate ballot_ids
            Number of CVR files
            Number of precincts
            Number of parties
            Number of style_nums
            Number of card_codes
            Number of ballots w/o card_codes
            Number of BMD ballots
            Number of corrupted ballots (could not be read)
            Number of different sheets
            Number of each sheet
        
        This operation completes quickly and currently produces a text report to console.
        Can provide alternative data output as JSON or HTML through command line switch.
            
        """
        create_bif_report(argsdict)
        
    elif op == 'build_template_tasklists':
        """ 
        Scan bifs and generate template tasklists, with one tasklist csv file per style.
        tasklist is the same format as bif but should not be updated with any information.
        This generally not used as REST entry point.
        """
        build_template_tasklists(argsdict)

    elif op == 'gentemplates':
        """ this function requires that BIF data is available. Used as REST entry point.
            1. generates template tasklists
            2. contructs templates by combining usually 50 ballots to improve resolution.
            Result is a set of raw templates (PNG files), one for each style,
            and possibly also checkpoint images including the components (up to 50).
            
            This function takes significant time, of more than a minute per style. 
            However, this can be delegated to lambdas and may be completed 
            in (# styles/1000) * time per style, but still too long for single REST POST.
            For Dane County, WI, with 191 styles, it still takes at least a minute.
            If all 10,000 styles are used in SF, time is 10 minutes.
            
            Log file updated.
            Report generated of result.
            PNG files for review of each style.
        """
        if argsdict['include_gentemplate_tasks']:    # sub tasks in gentemplate action - generate base templates
            build_template_tasklists(argsdict)
            
        gentemplates_by_tasklists(argsdict)

    elif op == 'gentemplates_only':
        """ This function used for debugging only when tasklists are already generated.
            Tasklists take only seconds to complete now.
            NOT USED IN REST API
        """
        gentemplates_by_tasklists(argsdict)

    elif op == 'genrois':
        """
        After templates are generated, each style is image-analyzed and then OCR'd.
        Result is set of PNG images providing regions of interest (ROIs) determined.
        Style templates must be generated at this point to allow further analysis and generation of rois
        The json list of rois and the image for each result.
        
        Result:
            Creates a report of rois generated
            PNG image files with graphic outlines of rois that can be reviewed by the user.
        """
        genrois(argsdict)

    elif op == 'maprois':
        """
        Once Rois are generated, they can be fairly quickly mapped to contests and options based on information
        in the EIF - Election Information File. This operates at the rate of several seconds per style.
        Result is 
            PNG "redlines" showing the mapping of contests and options to each style.
            Map report, providing detail of where mapping may have gotten off track.
            Log.
        """
        maprois(argsdict)

    elif op == 'genmaprois':
        """ 
        Major REST entry point.
        This the most typical operation once templates have been generated, which may take
        time and use compute resources. May need to be done repetitively while operator makes
        changes to settings file. Operator must review the map report and redlines.
        Once review is completed, then extraction can commence.
        Can break this up for processing by lambdas but it is so fast now that it may not be necessary.
        Result is:
            PNG images showing ROIS from genrois
            PNG redlines showing the correspondence of contests and options for each style.
            failures copied to assist folder
            Map Report
            Log
        """
    
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'get_assist_requests':
        """ 
        After genmaprois is completed, some styles may need manual assistance by human operator.
        This is used in graphic-mode dominant rois generation rather than OCR dominant generation.
        Front end first requests assist requests, and the response is
            list of ballot_ids which needs assistance.
            path to each template file
            path to existing json file for that template.
            
        NOTE this is a new function which is not implemented yet.
        """
        pass
        
    elif op == 'write_new_assist_annotation':
        """ The front end will implement functionality like is implemented by 
            tools/template_edit.py, to allow the user to add rectangular regions,
            horizontal and vertical lines, to the image.
            Then, this writes a new JSON annodation file.
            Maybe this does not need to be provided if frontend can write to s3 directly.
        
        NOTE this is a new function which is not implemented yet, but is implemented
            for CLI operation as 'template_edit' using tools/template_edit.py
        """
        pass
        
    elif op == 'build_extraction_tasks':
        """ Scan bifs and generate extraction tasklists, with an appropriate number of ballots for each lambda.
            tasklist is the same format as bif and should not be updated with any information by lambda.
            This function completes rapidly and thus is combined with actual extraction.
        """
        build_extraction_tasks(argsdict)

    elif op == 'extractvote_only':
        """ with extraction tasklists already built, go through all the ballots in the 
            archives and extract the marks into single csv data table for each tasklist, 
            and then combine into a single csv file for each archive.
            Each tasklist is delegated to a separate lambda process.
            Each lambda can take up to 15 minutes to process one tasklist. Total time of this
            process is less than (# ballots / 200,000) * 15 minutes.
            So for a county like SF, with 500K ballots, upper limit is about 35 minutes.
            LA, the largest county in the US has about 6 million ballots, upper limit is 7.5 hours.
        """
        extractvote_by_tasklists(argsdict)
        #extractvote(argsdict)

    elif op == 'extractvote':
        """ Build extraction tasklists and then extract vote 
            Perform both the tasklist generation (fast) and extraction (slow) above.
            This is the normal REST entry point.
            Result is 
                marks_df.csv for each archive.
                Extraction Report
                Log
                Exception Report
        """
        # go through all the ballots in the archives and extract the marks into single json file for each archive
        build_extraction_tasks(argsdict)
        extractvote_by_tasklists(argsdict)

    elif op == 'genreport':
        """
        Once extraction is completed, a report of results can be produced independent of the voting 
        system results, or CVR. Can be compared with high-level election results.
        
        Result:
            summary of the election results per audit system.
            Includes total number of ballots:
                not processed by audit system due to misalignment or other corruption.
                not provided in archives.
            Compares with high-level election result.
            
        """
        genreport(argsdict)

    elif op == 'cmpcvr':
        """ If a CVR is available and the voting system evaluation of each ballot
            is provided, then this function compares the audit system result with
            the voting system cvr and provides a comprehensive result.
            This function processes each marks_df.csv that corresponds to each archive, and
            compares each record with CVR, which is fully combined into one data file by this
            function.
            Result:
                cmpresult_n.csv for each archive n processed.
                This file is not combined to a single report.
        """
        cmpcvr_by_tasklists(argsdict)

    elif op == 'gen_cmpcvr_report':
        """ 
        The result of cmpcvr is on an archive-by-archive basis and compares
        the combined CVR, which is generally not organized by archive, with the 
        marks_df.csv which are organized by archive. Creates a ballot-by-ballot
        comparison result on per-archive basis as csv file. Includes any 
        adjudications in the determination of discrepancies.
        Result:
            comprehensive report of the comparison, as JSON or text.
            JSON discrepancy list reduced to just the discrepancies.
            
        """
        generate_cmpcvr_report(argsdict)
        
    elif op == 'cmpcvr_and_report':
        """
        This is a major REST entry point.
        compares the CVR and creates a report by combining the above two functions.
        """
        cmpcvr_by_tasklists(argsdict)
        generate_cmpcvr_report(argsdict)
       
        
    elif op == 'get_discrepancy_list':
        """ new function for front end. After cmpcvr is completed, a full report is created. 
            This provides just the discrepancies to allow for adjudication in frontend UI,
            and the existing adjudication JSON file.
            This is a new function.
            Result:
                JSON list of discrepancies
                log updated.
            NOTE: THIS IS A NEW FUNCTION
        """
        pass
        
    elif op == 'submit_adjudications':
        """ front end will implement a review of all discrepancies and provides
            a DRE-like entry of votes as determined by review of ballot images
            This is a new function.
            Perhaps front end updates the adjudication file but this function 
            may be better so the action is properly logged.
            Results:
                status
                log updated.
            NOTE: THIS IS A NEW FUNCTION
        """
        pass

    # =============================================================================
    #    Updates the lambdas functions.
    # =============================================================================
    
    elif op == 'update_lambda' or op == 'update_lambdas':

        branch = argsdict.get('update_branch', 's3-and-lambdas-dev')

        """ to run this function, you must first delete the tree 'lambda_deploytment'
            including the folder.
        """
        
        function_name = argsdict.get('lambda_function', 'all')
        if function_name == 'all':
           update_lambda(update_all=True, branch=branch)
        else:
            update_lambda(function_name=function_name, branch=branch)

    # =============================================================================
    #    Additional operations only used for development and CLI operation.
    # =============================================================================
    
    elif op == 'post_gentemplate_cleanup':
        post_gentemplate_cleanup(argsdict)
    
    # elif op == 'combine_bif_chunks':
        # """ used for testing combining bif chunks
        # """
        # utils.combine_dirname_chunks_each_archive(argsdict, dirname='bif')
        
        
    elif op == 'get_manual_styles_to_contests':
    
        logs.sts("Processing manual_styles_to_contests", 3)
        # silent_error=True: missing manual file is tolerated; dict may be empty.
        style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)
        
        logs.sts(f"style_to_contests_dol:\n {pprint.pformat(style_to_contests_dol)}")

        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')


    elif op == 'web2eif':
        """
        This operation scrapes from a url provided a high-level report of results.
        It was thought at the time that this report would provide unique contest names
        and consistent option names, but even though they were shorter and a bit better
        than the CVR, they also were insufficient for our needs. Thus, althought this
        does provide a basic function, it is not up to date with the current EIF format
        and does not eliminate the need for the EIF and manual editing.
        RESEARCH ONLY.
        """
        web_scraper.run_scraper(url=argsdict['url'])
        sys.exit()

    #elif op == 'tidycvr':
    #    """ This operation converts and ES&S cvr to tidy format
    #    Although it is operational, it was found that the existing ES&S format was
    #    a reasonably consice and useful format and we would work with it.
    #    """
    #    tidy_ess_cvr(argsdict)
    #    sys.exit()

    elif op == 'cvr2styles':
        """
        DEPRECATED. Use validate_bifs or genbif_from_ballots
        This operation preprocesses an ES&S CVR file or multiple Dominion CVR files.
        creates two dicts:
        styles_dict, which provides contest list for each style_num
        ballotid_to_style dict, which provides style_num based on ballotid.
        This currently only works if the CVR has a column providding the style named 'Ballot Style'
        Would need a different approach if no Ballot Style column is provided, such as
            creating a logical style iD, perhaps bitstring of contests, and use that as a logcal style identifier.
            This would not match to any style designator on the ballot.
        Proceses multple CVR files one at a time. (scalable)

        convert_cvr_to_styles function is in styles_from_cvr_converter.py
        for dominion, get_styles_to_contests_dominion is in gentemplate.py
        """
        convert_cvr_to_styles(argsdict)

    elif op == 'gentrm':
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'tltrm':
        build_template_tasklists(argsdict)
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'alltemplates':
        """
        Perform all the steps to creation of templates
        """
        genbif_from_cvr(argsdict)
        build_template_tasklists(argsdict)
#        convert_cvr_to_styles(argsdict)
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    # elif op == 'download_results':
        # # download all results from s3 bucket.
        # s3utils.download_entire_dirname(argsdict, dirname='marks')
        # s3utils.get_and_merge_lambda_logs(argsdict)

    elif op == 'download_gentemplates':
        # download all gentemplates from s3 bucket.
        # NOT UPDATED TO NEW FILE STRUCTURE
        DB.download_entire_dirname(dirname='styles')
        #DB.download_entire_dirname(dirname='styles')

    elif op == 'delete_s3_results':
        # delete all results on s3 bucket.
        DB.delete_s3_results(argsdict)

    elif op == 'merge_results':
        """ merge results into single csv file.
        """
        utils.merge_results()

    elif op == 'check_extraction':
        check_extraction(argsdict)

    elif op == 'extractcmp':
        build_extraction_tasks(argsdict)
        extractvote_by_tasklists(argsdict)
        cmpcvr_by_tasklists(argsdict)
  
    # elif op == 'getlogs':
        # DB.get_and_merge_s3_logs()

    elif op == 'plotmetrics':
        plotmetrics()

    elif op == 'evalmarks':
        evalmarks()

    elif op == 'save_failing_ballots':
        # given list of ballots in inputfile, copy the original ballot image files
        # to (jobname)/styles/(ballot_id) folders
        
        # this function
        #   1. builds single bif table.
        #   2. looks each ballot up.
        #   3. using entry, opens the indicated archive and extracts the original file.
        #   4. saves the file in folder of jobname and ballot_id in styles, see above.
        save_failing_ballots(argsdict)

    elif op == 'reprocess_failing_ballots':
    
        reprocess_failing_ballots(argsdict)


    else:
        # unrecognized op: report it and terminate rather than silently continuing.
        print("op value not defined ", op)
        sys.exit()