def parse_boulder_csv(file, options):
    """Parse Boulder Audit CSV file.

    It has one line per contest per batch.  A sample is in
    testdata/test-boulder-csv.txt

    The first line is a header line, including columns for each choice.
    The first valid choice for a contest is deemed to be the 3rd entry.
    The last valid choice for a contest is always 'Under Votes'.
    Entries named "Cast Votes" or "Blank" are skipped.

    "MBB Name","Contest Name","Contest Ballots","YES","NO","Cast Votes","Over Votes","Under Votes",,,,,,,,

    Arguments:
      file: path of the CSV file to parse
      options: parsed command-line options; uses .election, .batchid,
        .contest (filter, or None for all) and .min_ballots

    Side effect: pushes AuditUnits via util.pushAuditUnit.
    """

    election = options.election
    au = util.AuditUnit()

    # Context manager so the input file is closed even if parsing raises
    # (the original leaked the handle from open()).
    with open(file) as f:
        reader = csv.DictReader(f, delimiter=",")

        for r in reader:
            batch = [r['MBB Name'] + options.batchid]
            contest = r['Contest Name']
            ballots = r['Contest Ballots']
            #TODO: If this is a primary, see how to get party information
            if options.contest is not None and options.contest != contest:
                continue
            # Choices start at the 4th column (index 3) of the header.
            for choice in reader.fieldnames[3:]:
                # Skip the "Cast Votes" column, in among the subtotals
                if choice in ["Cast Votes", "Blank"]:
                    continue
                # Use standard names for votes that are Under or Over
                db_choice = choice
                if db_choice == "Under Votes":
                    db_choice = "Under"
                if db_choice == "Over Votes":
                    db_choice = "Over"

                # If the batch or contest has changed, push out the previous unit
                if batch != au.batches or contest != au.contest:
                    logging.debug("now batch '%s' contest '%s' at line %d" % (batch, contest, reader.reader.line_num))
                    util.pushAuditUnit(au, min_ballots=options.min_ballots)
                    # FIXME: try to extract type from batch name
                    au = util.AuditUnit(election, contest, 'U', batch, ballots)

                au.update(db_choice, r[choice])

    # Flush the final audit unit.
    util.pushAuditUnit(au, min_ballots=options.min_ballots)
def parse_sequoia(file, options):
    """Parse Sequoia precinct results in "text with headers" format:
    a tab-separated .txt file.

    It has one line per candidate per contest per precinct.
    The model of this format is the Denver 2008 data sample in
    "testdata/test-sequoia-precinct.txt".
    If the data is to be aggregated for privacy (the default), the data
    should be sorted by batch (precinct).

    The first line has the column headers:
    PRECINCT_NAME CANDIDATE_FULL_NAME contest_party_id candidate_party_id
    CONTEST_TYPE contest_id CONTEST_ORDER CANDIDATE_ORDER CONTEST_FULL_NAME
    TOTAL PRECINCT_ID precinct_order contest_vote_for PROCESSED_DONE
    PROCESSED_STARTED CONTEST_TOTAL IS_WRITEIN undervote overvote

    Question - can separate Absentee, Early and In-precinct count be generated?

    Side effect: pushes AuditUnits via util.pushAuditUnit.
    """

    election = options.election
    au_AB = util.AuditUnit()

    with open(file) as f:
        reader = csv.DictReader(f, delimiter="\t")

        for r in reader:
            batch = [r['PRECINCT_NAME'] + options.batchid]
            contest = r['CONTEST_FULL_NAME']
            #TODO: If this is a primary, see how to get party information
            #if r['Party_Code']:
            #    contest += ":" + r['Party_Code']
            if options.contest is not None and options.contest != contest:
                continue
            choice = r['CANDIDATE_FULL_NAME'].strip()

            # If the batch or contest has changed, push out the previous unit
            if batch != au_AB.batches or contest != au_AB.contest:
                logging.debug("now batch '%s' contest '%s' at line %d" % (batch, contest, reader.reader.line_num))
                util.pushAuditUnit(au_AB, min_ballots=options.min_ballots)
                au_AB = util.AuditUnit(election, contest, 'AB', batch)

            au_AB.update(choice, r['TOTAL'])
            if r['CANDIDATE_ORDER'] == '1':
                # under/over counts are duplicated for each candidate - silly
                au_AB.update('Under', r['undervote'])
                au_AB.update('Over', r['overvote'])

    # Flush the final audit unit: the loop only pushes on a batch/contest
    # change, so without this the last batch's data was silently dropped
    # (matches the final push done by parse_boulder_csv / parse_swdb_csv).
    util.pushAuditUnit(au_AB, min_ballots=options.min_ballots)
def parse_swdb_csv(file, options):
    """Parse csv dump of California Statewide Database (SWDB) - a
    comma-separated .csv file.

    If the data is to be aggregated for privacy (the default), the data
    should be sorted by batch (precinct).

    Question - can separate Absentee, Early and In-precinct count be generated?

    Side effect: pushes AuditUnits via util.pushAuditUnit.
    """

    election = options.election
    au = util.AuditUnit()

    with open(file) as f:
        reader = csv.DictReader(f, delimiter=",")

        for r in reader:
            batch = [r['SVPREC_KEY'] + options.batchid]
            contest = "Congress%.2d" % int(r['CDDIST'])
            ballots = r['TOTVOTE']
            #TODO: If this is a primary, see how to get party information
            #if r['Party_Code']:
            #    contest += ":" + r['Party_Code']
            if options.contest is not None and options.contest != contest:
                continue
            # One column per party's congressional candidate.
            for choice in ['CNGDEM', 'CNGGRN', 'CNGREP', 'CNGLIB', 'CNGPAF', 'CNGAIP']:
                # If the batch or contest has changed, push out the previous unit
                if batch != au.batches or contest != au.contest:
                    logging.debug("now batch '%s' contest '%s' at line %d" % (batch, contest, reader.reader.line_num))
                    util.pushAuditUnit(au, min_ballots=options.min_ballots)
                    au = util.AuditUnit(election, contest, 'U', batch, ballots)

                au.update(choice, r[choice])

            # Undervotes and Overvotes are not included in the input, but their sum is
            # implicit in the number of ballots minus the votes for candidates.
            # So make contest_ballots() work by assuming that all the
            # unaccounted-for ballots are undervotes.
            au.update("Under", str(int(ballots) - au.contest_ballots()))

    # Flush the final audit unit.
    util.pushAuditUnit(au, min_ballots=options.min_ballots)
def parse_hart_csv(file, options):
    """Parse a csv file of election data.

    The model of this format is a Hart precinct spreadsheet from Orange County:
    testdata/test-orange-hart.csv  or
    http://www.sos.ca.gov/elections/sov/2009-special/precinct-data/data/orange-20090519.csv

    Tracks three vote types per contest per precinct:
    AB (absentee mail), EV (absentee walk-in / early) and ED (election day).

    Side effect: pushes AuditUnits via util.pushAuditUnit.
    """

    election = options.election
    au_AB = util.AuditUnit()
    au_EV = util.AuditUnit()
    au_ED = util.AuditUnit()

    with open(file) as f:
        reader = csv.DictReader(f)

        for r in reader:
            batch = [r['Precinct Name'] + options.batchid]
            contest = r['Contest Title']
            if r['Contest Party']:
                contest += ":" + r['Contest Party']
            if options.contest is not None and options.contest != contest:
                continue
            # Do a bit of normalization - collapse runs of spaces to one
            choice = r['Candidate Name'].strip()
            while choice.find("  ") != -1:
                choice = choice.replace("  ", " ")

            # If the batch or contest has changed, push out the previous units
            if batch != au_AB.batches or contest != au_AB.contest:
                AB_ballots = r['Absentee Mail Ballots']
                EV_ballots = r['Absentee Walk-in Ballots']
                ED_ballots = r['Election Ballots']
                logging.debug("now batch '%s' contest '%s' at line %d" % (batch, contest, reader.reader.line_num))
                util.pushAuditUnit(au_AB, min_ballots=options.min_ballots)
                au_AB = util.AuditUnit(election, contest, 'AB', batch, AB_ballots)
                util.pushAuditUnit(au_EV, min_ballots=options.min_ballots)
                au_EV = util.AuditUnit(election, contest, 'EV', batch, EV_ballots)
                util.pushAuditUnit(au_ED, min_ballots=options.min_ballots)
                au_ED = util.AuditUnit(election, contest, 'ED', batch, ED_ballots)

            au_AB.update(choice, r['Absentee Mail Votes'])
            au_EV.update(choice, r['Absentee Walk-in Votes'])
            au_ED.update(choice, r['Election Votes'])
            if r['Candidate Seq Nbr'] == '1':
                # under/over counts are duplicated for each candidate
                au_AB.update('Under', r['Absentee Mail Blank Votes'])
                au_AB.update('Over', r['Absentee Mail Over Votes'])
                au_EV.update('Under', r['Absentee Walk-in Blank Votes'])
                au_EV.update('Over', r['Absentee Walk-in Over Votes'])
                au_ED.update('Under', r['Election Blank Votes'])
                au_ED.update('Over', r['Election Over Votes'])

    # Flush the final audit units: the loop only pushes on a batch/contest
    # change, so without this the last batch's data was silently dropped.
    util.pushAuditUnit(au_AB, min_ballots=options.min_ballots)
    util.pushAuditUnit(au_EV, min_ballots=options.min_ballots)
    util.pushAuditUnit(au_ED, min_ballots=options.min_ballots)
def parse_csv(file, options):
    """Parse a csv file of election data.

    It has one line per candidate per contest per precinct.
    The model of this format is the San Mateo precinct spreadsheet in
    "testdata/test-san-mateo-dp-92-p.csv".
    If the data is to be aggregated for privacy (the default), the data
    should be sorted by batch (precinct).

    Tracks three vote types per contest per precinct:
    AB (absentee), EV (early) and ED (election day).

    Side effect: pushes AuditUnits via util.pushAuditUnit.
    """

    election = options.election
    au_AB = util.AuditUnit()
    au_EV = util.AuditUnit()
    au_ED = util.AuditUnit()

    with open(file) as f:
        reader = csv.DictReader(f)

        for r in reader:
            batch = [r['Precinct_name'] + options.batchid]
            contest = r['Contest_title']
            if r['Party_Code']:
                contest += ":" + r['Party_Code']
            if options.contest is not None and options.contest != contest:
                continue
            # Do a bit of normalization - collapse runs of spaces to one
            choice = r['candidate_name'].strip()
            while choice.find("  ") != -1:
                choice = choice.replace("  ", " ")

            # If the batch or contest has changed, push out the previous units
            if batch != au_AB.batches or contest != au_AB.contest:
                logging.debug("now batch '%s' contest '%s' at line %d" % (batch, contest, reader.reader.line_num))
                util.pushAuditUnit(au_AB, min_ballots=options.min_ballots)
                au_AB = util.AuditUnit(election, contest, 'AB', batch)
                util.pushAuditUnit(au_EV, min_ballots=options.min_ballots)
                au_EV = util.AuditUnit(election, contest, 'EV', batch)
                util.pushAuditUnit(au_ED, min_ballots=options.min_ballots)
                au_ED = util.AuditUnit(election, contest, 'ED', batch)

            au_AB.update(choice, r['absentee_votes'])
            au_EV.update(choice, r['early_votes'])
            au_ED.update(choice, r['election_votes'])
            if r['cand_seq_nbr'] == '1':
                # under/over counts are duplicated for each candidate - silly
                au_AB.update('Under', r['absentee_under_votes'])
                au_AB.update('Over', r['absentee_over_votes'])
                au_EV.update('Under', r['early_under_votes'])
                au_EV.update('Over', r['early_over_votes'])
                au_ED.update('Under', r['election_under_votes'])
                au_ED.update('Over', r['election_over_votes'])

    # Flush the final audit units: the loop only pushes on a batch/contest
    # change, so without this the last batch's data was silently dropped.
    util.pushAuditUnit(au_AB, min_ballots=options.min_ballots)
    util.pushAuditUnit(au_EV, min_ballots=options.min_ballots)
    util.pushAuditUnit(au_ED, min_ballots=options.min_ballots)
def parse_xml_crystal(file, options):
    """Extract relevant data from each contest in a given crystalreports
    xml file (as produced by a Hart tally system).

    For each contest, builds three AuditUnits (AB = absentee, EV = early,
    ED = election day) for the single batch derived from the file name,
    and collects them in a local 'values' dict keyed by "contest:party".

    NOTE(review): 'values' is built but never returned or stored — confirm
    whether a 'return values' is missing or the push happens elsewhere.
    """

    import lxml.etree as ET

    election = options.election

    if os.path.basename(file) == "cumulative.xml":
        # if it's the boring default name, use an alternate naming scheme:
        # the parent directory of the canonical path
        batch = os.path.basename(os.path.dirname(os.path.realpath(file)))
    else:
        batch = os.path.basename(file)[0:-4]  # trim directory and ".xml"

    # filter out this confounding unprefixed namespace attribute
    # ...or figure out how to parse it...
    filterout = "xmlns = 'urn:crystal-reports:schemas'"

    # Read the whole file, strip the namespace declaration, and re-parse
    # from an in-memory buffer.
    import StringIO
    newfile = StringIO.StringIO()
    newfile.write(open(file).read().replace(filterout, ""))
    newfile.seek(0)

    root = ET.parse(newfile).getroot()
    logging.debug("root = %s" % root)

    # The Hart system forces the use of some odd contest names.
    # This is a table of fixes for what Boulder needed in the 2008 general.
    replacements = [
        (", Vote For 1", ""),
        ("THE EARNINGS FROM THE INVESTMENT", "ST VRAIN VALLEY SCHOOL DISTRICT NO. RE-1J BALLOT ISSUE NO. 3B"),
        ("BALLOT ITEM REMOVED ", ""),
        ]

    values = {}

    # Each "Group" FormattedAreaPair is one contest.
    for contesttree in root.xpath('//FormattedAreaPair[@Type="Group"]'):
        # The contest name lives in the Header's district_info field.
        tree = contesttree.xpath('FormattedArea[@Type="Header"]//FormattedReportObject[@FieldName="{@district_info}"]/FormattedValue')
        if len(tree) != 1:
            # NOTE(review): execution falls through and still uses tree[0]
            # below, which would raise IndexError if len(tree) == 0.
            logging.error("Error: number of Headers should be 1, not %d. Line %d" % (len(tree), contesttree.sourceline))
            logging.debug(ET.tostring(contesttree, pretty_print=True))

        contest_name = tree[0].text
        # Collapse runs of spaces to a single space.
        while contest_name.find("  ") != -1:
            contest_name = contest_name.replace("  ", " ")

        for old, new in replacements:
            contest_name = contest_name.replace(old, new)

        # NOTE: this may not work in primary, if there are multiple
        # contests with the same name per election, one for each party
        contest = contest_name.strip()

        if options.contest != None and options.contest != contest:
            logging.debug("skipping %s" % (contest))
            continue

        au_AB = util.AuditUnit(election, contest, "AB", [batch])
        au_EV = util.AuditUnit(election, contest, "EV", [batch])
        au_ED = util.AuditUnit(election, contest, "ED", [batch])

        logging.debug("Contest: %s (%s)" % (contest, tree[0].text))

        """ We don't need the Header: we get contest from the node itself
        tree_head = extract_values(contesttree.xpath(
            'FormattedArea[@Type="Header"]' ), fields )
        or maybe look at just '{@district_info}': 'Contest',
        if tree_head['Contest'] != tree[0].text:
            print "head = ", tree_head, " contest = ", tree[0].text
        """

        #logging.debug("tree:\n" + ET.tostring(contesttree, pretty_print=True))

        # Get undervotes and overvotes from Footer
        absenteer = extract_values(
            contesttree.xpath('FormattedArea[@Type="Footer"]'),
            { '{@_Combine_Under}': 'Under',   # Report combined AB/Early here
              '{@AB_Under_votes}': 'Under',
              '{@_Combine_Over}': 'Over',     # Report combined AB/Early here
              '{@AB_Over_Votes}': 'Over' })

        if absenteer == {}:  # E.g. "No Candidate for Race"
            continue

        try:
            au_AB.update('Under', absenteer['Under'])
            au_AB.update('Over', absenteer['Over'])
        except KeyError, key:
            print(
                "Parsing error in file %s\n line: %s\n KeyError Exception for key: '%s'\n contest: %s\n absenteer: %s\n tree:\n%s"
                % (file, contesttree.sourceline, key, contest, absenteer, ET.tostring(contesttree, pretty_print=True)))

        earlyr = extract_values(
            contesttree.xpath('FormattedArea[@Type="Footer"]'),
            { '{@EA_Under_Votes}': 'Under',
              '{@EA_Over_Votes}': 'Over' })

        # NOTE(review): unlike the absentee case above, missing keys here
        # are not caught and would raise KeyError.
        au_EV.update('Under', earlyr['Under'])
        au_EV.update('Over', earlyr['Over'])

        electionr = extract_values(
            contesttree.xpath('FormattedArea[@Type="Footer"]'),
            { '{sp_cumulative_rpt.c_under_votes_election}': 'Under',
              '{sp_cumulative_rpt.c_over_votes_election}': 'Over' })

        au_ED.update('Under', electionr['Under'])
        au_ED.update('Over', electionr['Over'])

        #logging.debug(contesttree.getchildren())

        parties = set()
        # For each candidate or option
        for c in contesttree.xpath('.//FormattedAreaPair[@Type="Details"]'):
            cv = extract_values(
                c,
                { '{@_Display_Candidate_Name}': 'Name',
                  '{sp_cumulative_rpt.party}': 'Party',
                  #'{@Tl_total_cand}': 'Election day',  #??
                  '{sp_cumulative_rpt.c_votes_election}': 'Election day',  #??
                  '{@_Combine_AB_EA}': 'Absentee',  # report combined here
                  '{@AB_Votes}': 'Absentee',
                  '{@EA_Votes}': 'Early' })

            # Normalize the candidate name: strip, collapse runs of spaces.
            choice = cv['Name']
            choice = choice.strip()
            while choice.find("  ") != -1:
                choice = choice.replace("  ", " ")
            cv['Name'] = choice

            logging.debug("candidate: %s" % cv['Name'])
            au_AB.update(cv['Name'], cv['Absentee'])
            au_EV.update(cv['Name'], cv['Early'])
            au_ED.update(cv['Name'], cv['Election day'])
            parties.add(cv['Party'])

        assert len(parties) > 0  # or == 1 for primary?
        party = parties.pop() or ""

        key = "%s:%s" % (contest, party)
        values[key] = [au_AB, au_EV, au_ED]
def parse_swdb(file, options):
    """Parse swdb file.  "file" can be a file, url, or string
    suitable for openAnything().

    Also needs a source of the "codes" to annotate the choice names.

    Reads DBF records, derives a batch id and vote type (AB = absentee,
    BA = at-precinct) from the precinct key, and pushes one AuditUnit per
    contest column via util.pushAuditUnit.
    """

    # Column prefixes for statewide single contests vs. districted ones.
    one_contest_prefixes = ('PRS', 'SEN', 'PR_')
    dist_contest_prefixes = ('CNG', 'ASS')
    contest_prefixes = one_contest_prefixes + dist_contest_prefixes

    # Disabled code for annotating choices from an external "codes" file.
    """
    choices = {}
    totals = {}
    codes_name = "003.codes"
    codes = openanything.openAnything(codes_name)
    for l in codes:
        (code, choice, total) = l.rstrip().split('\t')
        if code.startswith(contest_prefixes):
            choices[code] = choice
            totals[code] = total
        elif code.endswith(('VOTE', 'REG', 'DIST')):
            # FIXME - deal with this later
            continue
        else:
            print "unrecognized code: %s in line %s" % (code, l)
    """

    reader = Dbf(file)
    au = util.AuditUnit(options.election)

    #for r in reader:
    # NOTE(review): reader_iter is never used; records are fetched by index
    # below so that a corrupt record can be skipped without aborting.
    reader_iter = iter(reader)
    rec = 0
    while True:
        try:
            r = reader[rec]
        except (IndexError, StopIteration):
            # Past the last record: done.
            break
        except:
            # Corrupt record: report it and move on to the next one.
            # NOTE(review): 'r' here is the *previous* record (or unbound on
            # the very first iteration, which would raise NameError).
            import traceback
            traceback.print_exc(1)
            logging.error("Dbf error: %s\nrecord %d" % (r, rec))
            rec = rec + 1
            continue
        rec = rec + 1

        #batch = r["SRPREC"]
        batch = r["SRPREC_KEY"]
        #batch = r["SVPREC"]
        #batch = r["SVPREC_KEY"]

        # Skip summary rows.
        if batch.startswith('SOV') or batch.endswith('TOT'):
            continue

        # state-wide data marks absentee with trailing "A",
        # county data marks them with "_A"
        # NOTE(review): 'type' shadows the builtin.
        if batch.endswith('A'):
            type = "AB"
            if batch.endswith('_A'):
                batch = batch[0:-2]
            else:
                batch = batch[0:-1]
        else:
            type = "BA"

        # Assembly and congressional district numbers for this precinct.
        addist = r['ADDIST']
        cddist = r['CDDIST']
        #sddist = r['SDDIST']

        for code in reader.fieldNames:
            # Skip non-contest columns (precinct keys, registration, etc.)
            if code.endswith(('PREC', 'VOTE', 'REG', 'DIST', 'SVPREC_KEY')):
                continue
            code_full = code
            contest = code[:3]
            # Districted contests get the district number spliced into the
            # code so e.g. ASSxxDEM becomes a per-district contest name.
            if code.startswith('ASS'):
                code_full = code[:3] + ("%02d" % addist) + code[-3:]
                contest = code_full[:5]
            elif code.startswith('CNG'):
                code_full = code[:3] + ("%02d" % cddist) + code[-3:]
                contest = code_full[:5]
            elif code.startswith('PR_'):
                contest = code[:-1]
            else:
                contest = code[:3]
            if options.contest != None and options.contest != contest:
                continue

            # until we fully figure out how to get the district numbers...
            # contest = contests[code]

            try:
                au = util.AuditUnit(options.election, contest, type, [batch])
                # The choice name is the remainder of the code after the
                # contest prefix (e.g. "DEM", "REP").
                au.update(code_full[len(contest):], str(r[code]))
                util.pushAuditUnit(au, min_ballots = options.min_ballots)
            except:
                # NOTE(review): bare except hides the real error type.
                print "Error looking up code %s (%s) for %s-%s" % (code, code_full, batch, type)
                continue