class ElectionsTableBuilder(object):
    """Create the ``elections`` table and fill it from per-year results files.

    For every election year, the first line of the house and senate results
    file (``BASE_DIR/<year>/<year>reps1.txt`` / ``...senate1.txt``) is read
    and the last three words of that line are stored as the election date.
    """

    # Results-file suffix for each chamber this builder can load.
    _RESULT_FILE_SUFFIXES = {'house': 'reps1.txt', 'senate': 'senate1.txt'}

    def __init__(self, host, user, database):
        """Open the database connection used by all other methods."""
        self.db = SQLConnection(host=host, user=user, db=database)

    def build(self):
        """Recreate the ``elections`` table and populate it for every year."""
        self.create_elections_table()
        years = utils.get_election_years(base_dir=BASE_DIR)
        self.populate_elections_table(years)

    def __del__(self):
        # ``self.db`` does not exist if ``__init__`` failed before assigning
        # it, so look it up defensively instead of raising during cleanup.
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

    def create_elections_table(self):
        """Drop any existing ``elections`` table and create an empty one."""
        self.db.execute('DROP TABLE IF EXISTS elections;')
        self.db.execute(
            """
            CREATE TABLE elections (
                id INT NOT NULL AUTO_INCREMENT,
                level ENUM('federal', 'state'),
                chamber ENUM('house', 'senate', 'legislative_assembly', 'legislative_council'),
                election_date DATE,
                is_byelection BOOLEAN,
                PRIMARY KEY(id)
            );
            """
        )

    @staticmethod
    def _parse_election_date(headline):
        """Return ``'<day> <month> <year>'`` from a results-file headline.

        The date is taken from the last three whitespace-separated words.
        For 1901 the day is forced to ``'29'``, as the original code did.
        Returns ``None`` when the headline has fewer than three words.
        """
        # split() (rather than split(' ')) tolerates repeated spaces, tabs
        # and the trailing newline.
        words = headline.split()
        if len(words) < 3:
            return None
        election_day, election_month, election_year = words[-3:]
        if election_year == '1901':
            election_day = '29'
        return ' '.join([election_day, election_month, election_year])

    def _populate_election_table_element(self, year, chamber):
        """Insert the ``elections`` row for one year and chamber.

        Args:
            year: Election year as a string; used to build the file path.
            chamber: ``'house'`` or ``'senate'``.

        Raises:
            ValueError: If ``chamber`` is not a supported chamber.

        A missing results file, or a first line without a date, is logged
        and skipped.
        """
        try:
            txtfile = self._RESULT_FILE_SUFFIXES[chamber]
        except KeyError:
            raise ValueError('Unsupported chamber: %r' % (chamber,))

        fname = BASE_DIR + year + '/' + year + txtfile
        if not os.path.isfile(fname):
            logging.info(' '.join(['No', chamber, 'results file in', year, ". Continuing.\n"]))
            return

        with open(fname, 'r') as f:
            headline = f.readline()

        election_date_str = self._parse_election_date(headline)
        if election_date_str is None:
            logging.warning('No election date found in first line of ' + fname)
            return

        sql = """
            INSERT INTO elections (level, chamber, election_date, is_byelection)
            VALUES ('%s', '%s', STR_TO_DATE('%s', '%%d %%M %%Y'), %d)
        """ % ('federal', chamber, election_date_str, 0)
        self.db.execute(sql)

    def populate_elections_table(self, election_years):
        """Insert house and senate rows for each year in ``election_years``."""
        for year in election_years:
            self._populate_election_table_element(year, 'house')
            self._populate_election_table_element(year, 'senate')
class CandidatesTableBuilder(object):
    """Build the ``candidate_names`` and ``candidacies`` tables.

    The candidate index files under ``BASE_DIR + 'candidates/'`` are read.
    Lines starting with ``>`` are index entries (candidate: contests), and
    lines starting with ``<`` are either aliases ("see ...") or, when they
    contain digits, are parsed like index entries.
    """

    # Patterns are kept exactly as in the original, written as raw strings.
    _INDEX_ENTRY_RE = re.compile(
        r'>\s(.+, .+ \(.+, .+\)):\s(.+)|>\s(.+, .+):\s(.+)|>\s(.+, .+):'
        r'|>\s(.+, .+),\s(.+)|>\s(.+):\s(.+)|>\s(.+, .+)\s(.+)'
    )
    _INDEX_ALIAS_RE = re.compile(r'<\s(.+): see (.+)|<\s(.+) \(see (.+)\)')
    _MISTAKEN_ENTRY_RE = re.compile(r'<\s(.+): (.+)')

    def __init__(self, host, user, database):
        """Open the database connection and set the candidate-file directory."""
        self.db = SQLConnection(host=host, user=user, db=database)
        self.base_dir = BASE_DIR + 'candidates/'

    def __del__(self):
        # ``self.db`` does not exist if ``__init__`` failed before assigning
        # it, so look it up defensively instead of raising during cleanup.
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

    def build(self):
        """Recreate both tables and load them from the candidate index."""
        self.create_candidates_tables()
        candidate_files = self.get_candidate_files()
        candidate_index_raw = self.get_candidate_index(candidate_files)
        candidate_index, cross_ref = self.parse_index_and_cross_reference(candidate_index_raw)
        self.write_candidates_table(candidate_index)

    def create_candidates_tables(self):
        """Drop and recreate ``candidacies`` and ``candidate_names``."""
        self.db.execute('DROP TABLE IF EXISTS candidacies;')
        self.db.execute(
            """
            CREATE TABLE candidacies (
                id INT NOT NULL AUTO_INCREMENT,
                election_id INT,
                electorate_id INT,
                state_code VARCHAR(3),
                candidate_name_id INT,
                was_elected BOOLEAN,
                PRIMARY KEY(id)
            );
            """
        )
        self.db.execute('DROP TABLE IF EXISTS candidate_names;')
        self.db.execute(
            """
            CREATE TABLE candidate_names (
                id INT NOT NULL AUTO_INCREMENT,
                candidate_name VARCHAR(50),
                PRIMARY KEY(id)
            );
            """
        )

    def get_candidate_files(self):
        """Return paths of all files under ``self.base_dir``.

        Files whose name contains ``intro.txt`` or ``.shtml`` are excluded.
        """
        fnames = []
        for root, _, files in os.walk(self.base_dir):
            for end in files:
                if 'intro.txt' in end or '.shtml' in end:
                    continue
                # os.path.join inserts the separator that plain ``root + end``
                # omitted for files in subdirectories.
                fnames.append(os.path.join(root, end))
        return fnames

    def get_candidate_index(self, fnames):
        """Read the raw index lines from ``fnames``.

        A line starting with ``>`` or ``<`` begins a new entry. A line
        starting with a space is appended to the previous entry. All other
        lines are ignored.
        """
        candidate_index_raw = []
        for fname in fnames:
            with open(fname, 'r') as candidate_file:
                for line in candidate_file:
                    if line.startswith(('>', '<')):
                        candidate_index_raw.append(line.strip())
                    elif line.startswith(' ') and candidate_index_raw:
                        # A continuation with nothing to continue is dropped
                        # instead of raising IndexError.
                        candidate_index_raw[-1] += ' ' + line.strip()
        return candidate_index_raw

    @staticmethod
    def _pair_from_match(match):
        """Return ``{first: second}`` from the matched groups, else ``None``.

        ``None`` is returned when there is no match or when the number of
        non-empty groups is not exactly two.
        """
        if match is None:
            return None
        found = [group for group in match.groups() if group is not None]
        return {found[0]: found[1]} if len(found) == 2 else None

    def _re_parse_index_entry(self, entry):
        """Parse a ``>`` index entry into ``{candidate: contests}`` or ``None``."""
        return self._pair_from_match(self._INDEX_ENTRY_RE.search(entry))

    def _re_parse_index_alias(self, entry):
        """Parse a ``<`` "see ..." entry into ``{alias: target}`` or ``None``."""
        return self._pair_from_match(self._INDEX_ALIAS_RE.search(entry))

    def _re_parse_mistaken_entry(self, entry):
        """Parse a ``<`` entry containing digits as an index entry, or ``None``."""
        return self._pair_from_match(self._MISTAKEN_ENTRY_RE.search(entry))

    def parse_index_and_cross_reference(self, candidate_index_raw):
        """Split raw index lines into an index and an alias cross-reference.

        Returns:
            ``(index, cross_ref)``: ``index`` maps candidate name to the
            contests text; ``cross_ref`` maps alias to the referenced name.

        Entries that cannot be parsed are logged and skipped, whichever
        marker they start with.
        """
        index = dict()
        cross_ref = dict()
        for entry in candidate_index_raw:
            if entry.startswith('>'):
                parsed, target = self._re_parse_index_entry(entry), index
            elif entry.startswith('<'):
                if re.search(r'\d', entry) is None:
                    parsed, target = self._re_parse_index_alias(entry), cross_ref
                else:
                    parsed, target = self._re_parse_mistaken_entry(entry), index
            else:
                continue
            if parsed is None:
                logging.warning('Cannot parse: ' + entry)
                continue
            target.update(parsed)
        return index, cross_ref

    @staticmethod
    def _split_electorate_entry(electorates, states_short, states_long):
        """Split one contest string into its parts.

        Returns ``(electorate, state, state_code, years)``, or ``None`` when
        the text cannot be split at all.

        Resolution order:
        1. exactly one word matches a state code (case-insensitive);
        2. exactly one long state name occurs in the text;
        3. otherwise the first space separates electorate from years, and
           state and state code are the placeholder string ``'NULL'``.
        """
        words = electorates.split()
        code_hits = [word for word in words if word.upper() in states_short]
        if len(code_hits) == 1:
            state = code_hits[0]
            state_ix = words.index(state)
            return ' '.join(words[:state_ix]), state, state.upper(), words[state_ix + 1:]

        name_hits = [name for name in states_long if name in electorates]
        if len(name_hits) == 1:
            state = name_hits[0]
            # maxsplit=1 keeps this a two-way split even if the state name
            # occurs twice. strip() removes whitespace left around the
            # electorate so that e.g. 'Senate ' compares equal to 'Senate'.
            electorate, allyears = electorates.split(state, 1)
            electorate = electorate.strip()
            if len(electorate) == 0:
                electorate = state
            state_code = states_short[states_long.index(state)]
            return electorate, state, state_code, allyears.split()

        parts = electorates.split(' ', 1)
        if len(parts) != 2:
            return None
        return parts[0], 'NULL', 'NULL', parts[1].split()

    def write_candidates_table(self, candidate_index):
        """Insert candidate names and one ``candidacies`` row per contest year.

        Args:
            candidate_index: Mapping of candidate name to a comma-separated
                contests string, as produced by
                ``parse_index_and_cross_reference``.

        Year tokens must start with ``1`` or ``2``. Tokens ending in ``b``
        or ``b*`` are skipped, and a trailing ``*`` marks an elected
        candidate.
        """
        # NOTE(review): values are interpolated directly into SQL, as in the
        # original; a candidate name containing a double quote will break the
        # statement. Parameterised queries would need support from
        # SQLConnection, which is not visible here.
        states_short, states_long = zip(*self.db.fetch('SELECT code, state_name FROM states'))
        for candidate, elections in candidate_index.items():
            self.db.execute(
                """INSERT INTO candidate_names (candidate_name) VALUES ("%s")""" % candidate
            )
            # The name id depends only on the candidate, so look it up once
            # per candidate rather than once per year.
            candidate_sql = """SELECT id FROM candidate_names WHERE candidate_name = "%s" """ % candidate
            candidate_name_id = utils.safe_id(self.db.fetch(candidate_sql))

            for electorates in elections.split(','):
                parsed = self._split_electorate_entry(electorates, states_short, states_long)
                if parsed is None:
                    continue
                electorate, state, state_code, years = parsed
                # Write a real SQL NULL rather than the string 'NULL', which
                # does not fit the VARCHAR(3) column.
                state_code_sql = 'NULL' if state_code == 'NULL' else "'%s'" % state_code

                for year in years:
                    if not year.startswith(('1', '2')):
                        continue  # Catch bad parsing
                    if year.endswith(('b', 'b*')):
                        continue  # Exclude by-elections
                    safe_year = year[:4]
                    was_elected = 1 if year.endswith('*') else 0

                    if electorate == 'Senate':
                        election_id = utils.get_election_id(self.db, safe_year, 'senate')
                        electorate_id = "NULL"
                    else:
                        election_id = utils.get_election_id(self.db, safe_year, 'house')
                        # Compare by value; identity (`is`) on strings only
                        # works by accident of interning.
                        if election_id != "NULL":
                            electorate_id = utils.get_electorate_id(
                                self.db, electorate, state_code, election_id
                            )
                        else:
                            electorate_id = "NULL"
                    if election_id == "NULL":
                        logging.warning(candidate + ' ' + electorate + ' ' + state + ' ' + year)

                    insert_sql = (
                        'INSERT INTO candidacies '
                        '(election_id, electorate_id, state_code, candidate_name_id, was_elected) '
                        'VALUES ({election_id}, {electorate_id}, {state_code}, '
                        '"{candidate_name_id}", {was_elected})'
                    ).format(
                        election_id=election_id,
                        electorate_id=electorate_id,
                        state_code=state_code_sql,
                        candidate_name_id=candidate_name_id,
                        was_elected=was_elected,
                    )
                    self.db.execute(insert_sql)
class ResultsTableBuilder(object):
    """Build the ``results`` table from the house results files.

    Each results file is split into chunks at lines containing ``===``.
    Every chunk from the fourth onward is treated as one electorate whose
    candidate, preference and informal rows are parsed into per-candidate
    vote counts.
    """

    # Patterns are kept exactly as in the original, written as raw strings
    # and compiled once instead of on every call.
    _PCT_EXACT_RE = re.compile(r'^\d{2}\.\d$')
    _DELTA_PCT_EXACT_RE = re.compile(r'^\({0,1}[\+\-\s]\d{2}\.\d\){0,1}$')
    _PCT_RE = re.compile(r'\d{2}\.\d')
    _DELTA_PCT_RE = re.compile(r'\({0,1}[\+\-\s]\d{2}\.\d\){0,1}')
    _VOTES_RE = re.compile(r'\d{0,3}\,{0,1}\d{1,3}')

    def __init__(self, host, user, database):
        """Open the database connection used by all other methods."""
        self.db = SQLConnection(host=host, user=user, db=database)

    def build(self):
        """Recreate the ``results`` table and load every house results file."""
        self.create_results_table()
        states = utils.get_states()
        election_years = utils.get_election_years(base_dir=BASE_DIR)
        house_files = utils.get_election_files(BASE_DIR, election_years, 'house', states)
        for hfile in house_files:
            self.parse_results_data(hfile)

    def __del__(self):
        # ``self.db`` does not exist if ``__init__`` failed before assigning
        # it, so look it up defensively instead of raising during cleanup.
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

    def _parse_counts_line(self, line):
        """Join runs of consecutive all-digit tokens into integers.

        Args:
            line: List of string tokens, e.g. the result of splitting
                ``'12,345 enrolled'`` on whitespace and commas.

        Returns:
            One int per run of consecutive digit tokens, in order. A run
            that reaches the end of the list is included, and ``line`` is
            not modified.
        """
        output = []
        buff = ''
        for item in line:
            if item.isdigit():
                buff += item
            elif buff:
                output.append(int(buff))
                buff = ''
        if buff:
            # Flush a number that ends the line; the original dropped it.
            output.append(int(buff))
        return output

    def _to_int(self, string):
        """Convert a number written with ``,`` or ``.`` separators to int."""
        return int(string.replace(',', '').replace('.', ''))

    def _get_party_dict(self, party_chunk):
        """Return ``{PARTY_CODE: party name}`` from lines starting with ``*``.

        The code is the second whitespace-separated token, upper-cased; the
        name is everything from the fourth token on. ``*`` lines with no
        second token are skipped.
        """
        party_dictionary = {}
        for entry in party_chunk:
            if not entry.startswith('*'):
                continue
            split_entry = re.split(r'\s+', entry)
            if len(split_entry) < 2:
                continue
            party_code = split_entry[1].upper()
            party_dictionary[party_code] = ' '.join(split_entry[3:])
        return party_dictionary

    def _get_electorate_info(self, chunk):
        """Parse an electorate chunk's first line.

        Returns:
            ``(name, enrolled, ballots)``. All three are ``None`` when the
            chunk is empty or its first line has no 3+-space gap. A missing
            count is the string ``"NULL"``.
        """
        if not chunk:
            return None, None, None
        headline = re.split(r'\s\s\s+', chunk[0])
        if len(headline) == 1:
            return None, None, None
        electorate_name = headline[0].split(',')[0].title()
        electorate_counts = self._parse_counts_line(re.split(r'[\s,]', headline[1]))
        electorate_enrolled = electorate_counts[0] if electorate_counts else "NULL"
        electorate_ballots = electorate_counts[1] if len(electorate_counts) > 1 else "NULL"
        return electorate_name, electorate_enrolled, electorate_ballots

    def create_results_table(self):
        """Drop any existing ``results`` table and create an empty one."""
        self.db.execute('DROP TABLE IF EXISTS results;')
        # pct / tpp_pct are DECIMAL(4,1): unopposed candidates are written
        # with 100, which DECIMAL(3,1) (maximum 99.9) cannot store.
        self.db.execute(
            """
            CREATE TABLE results (
                id INT NOT NULL AUTO_INCREMENT,
                election_id INT,
                electorate_id INT,
                candidate_name_id INT,
                candidacy_id INT,
                ballot_name VARCHAR(50),
                party_code VARCHAR(5),
                votes INT,
                pct DECIMAL(4,1),
                tpp_votes INT,
                tpp_pct DECIMAL(4,1),
                PRIMARY KEY(id)
            );
            """
        )

    @staticmethod
    def _strip_markers(name):
        """Remove the ``*`` and ``+`` markers from a ballot name."""
        # Portable replacement for the Python 2-only str.translate(None, '*+').
        return name.replace('*', '').replace('+', '')

    @staticmethod
    def _assign_two_party_preferred(ballot_dict):
        """Set ``tpp_votes`` / ``tpp_pct`` on every row of ``ballot_dict``.

        The two candidates with the most ``tpp_votes`` keep their votes and
        get their share of the combined total as ``tpp_pct``. Every other
        row, including ``'Informal'``, gets ``"NULL"`` for both. A single
        candidate or a zero total no longer raises.
        """
        tpp_dict = dict(
            (key, row['tpp_votes']) for key, row in ballot_dict.items() if key != 'Informal'
        )
        top_tpp = sorted(tpp_dict.items(), key=lambda x: x[1], reverse=True)[:2]
        top_names = [key for key, _ in top_tpp]
        vote_total = sum(votes for _, votes in top_tpp)
        for key in ballot_dict:
            if key not in top_names:
                ballot_dict[key].update({'tpp_votes': "NULL"})
                ballot_dict[key].update({'tpp_pct': "NULL"})
            elif vote_total == 0:
                ballot_dict[key].update({'tpp_pct': "NULL"})
            else:
                tpp_pct = float(tpp_dict[key]) / vote_total * 100
                ballot_dict[key].update({'tpp_pct': tpp_pct})

    def _parse_ballot_counts(self, ballot_data):
        """Parse one electorate's lines into per-candidate result dicts.

        Args:
            ballot_data: The electorate's lines as strings.

        Returns:
            Dict keyed by a candidate key (normally the title-cased surname)
            or ``'Informal'``. Candidate rows hold ``full_name``,
            ``is_incumbent``, ``is_elected``, ``votes``, ``pct``,
            ``delta_pct``, ``party``, ``tpp_votes`` and, once at least two
            rows exist, ``tpp_pct``.

        Lines shorter than 20 characters, containing ``:`` or starting with
        ``>`` are ignored. The remaining lines are classified as:

        * last column ``Unopposed``: a candidate elected without a count;
        * contains ``informal`` and a number: the informal-vote row;
        * last column looks like a percentage: a candidate row when the
          name has several words, otherwise a row whose votes are added to
          that candidate's ``tpp_votes``.
        """
        ballot_dict = dict()
        for entry in ballot_data:
            entry = entry.replace('\x92', "'").replace('\xb9', "'")
            entry_data = re.split(r'\s{2,}', entry)
            if len(entry) < 20 or ':' in entry or entry.startswith('>'):
                continue

            if entry_data[-1] == 'Unopposed':
                name = entry_data[0]
                namekey = self._strip_markers(name).strip().title()
                ballot_dict[namekey] = {
                    'full_name': namekey.title(),
                    'is_incumbent': 1 if name.endswith(('*', '+')) else 0,
                    'is_elected': 1,
                    'votes': 'NULL',
                    'tpp_votes': 'NULL',
                    'tpp_pct': 100,
                    'pct': 100,
                    'delta_pct': 'NULL',
                    'party': entry_data[1] if entry_data[1].isalpha() else 'NULL',
                }

            elif 'informal' in entry and self._VOTES_RE.search(entry) is not None:
                if 'unknown' in entry:
                    votes = 'NULL'
                    pct = 'NULL'
                else:
                    # match(), not search(): the count is expected at the
                    # very start of an informal row.
                    votes = int(self._VOTES_RE.match(entry).group().replace(',', ''))
                    pct = float(self._PCT_RE.search(entry).group())
                ballot_dict['Informal'] = {'tpp_votes': 'NULL', 'votes': votes, 'pct': pct}

            elif (self._PCT_EXACT_RE.match(entry_data[-1]) is not None
                  or self._DELTA_PCT_EXACT_RE.match(entry_data[-1]) is not None):
                name = entry_data[0]
                if name.replace(',', '').isdigit():
                    continue
                name_parts = self._strip_markers(name).split()
                last_name = name_parts[-1]

                if len(name_parts[0]) == 1:
                    # Name starts with a one-character word: use the full
                    # "<initial> <surname>" key if it already exists.
                    namekey = ' '.join(name_parts).title()
                    if namekey in ballot_dict:
                        name_parts.pop(0)
                    else:
                        namekey = last_name.title()
                else:
                    namekey = last_name.title()

                if len(name_parts) > 1 or name == 'Strider':
                    if namekey in ballot_dict:
                        # Two candidates share a surname: re-key the earlier
                        # one and this one as "<initial> <surname>".
                        prev_name = ballot_dict[namekey]['full_name']
                        if name_parts[0] != 'Hon':
                            new_name = ' '.join([prev_name[0], prev_name.split()[-1]])
                        else:
                            new_name = ' '.join([prev_name[1], prev_name.split()[-1]])
                        ballot_dict.update({new_name: copy.deepcopy(ballot_dict[namekey])})
                        del ballot_dict[namekey]
                        namekey = ' '.join([name[0], namekey])

                    votes = int(self._VOTES_RE.search(entry).group().replace(',', ''))
                    pct = float(self._PCT_RE.search(entry).group())
                    delta_match = self._DELTA_PCT_RE.search(entry)
                    if delta_match is None:
                        delta_pct = 'NULL'
                    else:
                        delta_pct = float(delta_match.group().replace('(', '').replace(')', ''))
                    ballot_dict[namekey] = {
                        'full_name': ' '.join(name_parts).title(),
                        'is_incumbent': 1 if name.endswith('*') else 0,
                        # An all-capitals surname marks the elected candidate.
                        'is_elected': 1 if last_name == last_name.upper() else 0,
                        'votes': votes,
                        'tpp_votes': votes,
                        'pct': pct,
                        'party': entry_data[1] if entry_data[1].isalpha() else 'NULL',
                        'delta_pct': delta_pct,
                    }
                else:
                    # Single-word name: add its votes to that candidate's
                    # running total.
                    pref_votes = int(self._VOTES_RE.search(entry).group().replace(',', ''))
                    ballot_dict[namekey]['tpp_votes'] += pref_votes
                    is_elected = 1 if last_name == last_name.upper() else 0
                    ballot_dict[namekey].update({'is_elected': is_elected})

        if len(ballot_dict) >= 2:
            self._assign_two_party_preferred(ballot_dict)
        return ballot_dict

    @staticmethod
    def _quoted_or_null(value):
        """Return ``value`` in double quotes, or bare ``NULL`` for ``'NULL'``."""
        return '"%s"' % value if value != 'NULL' else 'NULL'

    def parse_results_data(self, fileinfo):
        """Parse one results file and insert a ``results`` row per ballot row.

        Args:
            fileinfo: Dict with keys ``'fname'``, ``'year'``, ``'chamber'``
                and ``'state'``.

        A file with no ``===`` separator line is logged and skipped.
        """
        fname = fileinfo['fname']
        election_id = utils.get_election_id(self.db, fileinfo['year'], fileinfo['chamber'])
        with open(fname, 'r') as f:
            lines = [line.strip() for line in f]

        breaks = [i for i, x in enumerate(lines) if '===' in x]
        if not breaks:
            logging.warning('No section separators found in ' + fname)
            return
        # Each chunk starts on the line before a '===' line (its heading).
        electorate_data = [lines[start - 1:end - 1] for start, end in zip(breaks, breaks[1:])]
        electorate_data.append(lines[breaks[-1] - 1:])

        for electorate in electorate_data[3:]:
            electorate_name, _, _ = self._get_electorate_info(electorate)
            if electorate_name is None:
                continue
            electorate_id = utils.get_electorate_id(
                self.db, electorate_name, fileinfo['state'], election_id
            )
            logging.info(electorate_name + ' ' + fileinfo['year'])
            ballot_counts = self._parse_ballot_counts(electorate)
            logging.info(ballot_counts)

            for candidate, row in ballot_counts.items():
                votes = row['votes']
                pct = row['pct']
                if candidate != 'Informal':
                    ballot_name = row['full_name']
                    candidate_name_id = utils.get_candidate_name_id(self.db, row['full_name'])
                    candidacy_id = utils.get_candidacy_id(
                        self.db, election_id, electorate_id, candidate_name_id
                    )
                    party_code = row['party']
                    tpp_votes = row['tpp_votes']
                    tpp_pct = row['tpp_pct']
                else:
                    ballot_name = 'NULL'
                    candidate_name_id = 'NULL'
                    candidacy_id = 'NULL'
                    party_code = 'NULL'
                    tpp_votes = 'NULL'
                    tpp_pct = 'NULL'

                sql = """
                    INSERT INTO results (election_id, electorate_id, candidate_name_id,
                                         candidacy_id, ballot_name, party_code, votes, pct,
                                         tpp_votes, tpp_pct)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """ % (
                    election_id,
                    electorate_id,
                    candidate_name_id,
                    candidacy_id,
                    self._quoted_or_null(ballot_name),
                    self._quoted_or_null(party_code),
                    votes,
                    pct,
                    tpp_votes,
                    tpp_pct,
                )
                self.db.execute(sql)
class ElectoratesTableBuilder(object):
    """Build the ``electorates`` table from the house results files.

    Each results file is split into chunks at lines containing ``===``.
    For every chunk from the third onward, the first line supplies the
    electorate name and the enrolment and ballot counts.
    """

    def __init__(self, host, user, database):
        """Open the database connection used by all other methods."""
        self.db = SQLConnection(host=host, user=user, db=database)

    def build(self):
        """Recreate the ``electorates`` table and load every house file."""
        self.base_dir = BASE_DIR + 'divisions/'
        self.create_electorates_table()
        states = utils.get_states()
        election_years = utils.get_election_years(base_dir=BASE_DIR)
        election_files = utils.get_election_files(BASE_DIR, election_years, 'house', states)
        for efile in election_files:
            self.parse_electorate_file(efile)

    def __del__(self):
        # ``self.db`` does not exist if ``__init__`` failed before assigning
        # it, so look it up defensively instead of raising during cleanup.
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

    def _parse_counts_line(self, line):
        """Join runs of consecutive all-digit tokens into integers.

        Args:
            line: List of string tokens, e.g. the result of splitting
                ``'12,345 enrolled'`` on whitespace and commas.

        Returns:
            One int per run of consecutive digit tokens, in order. A run
            that reaches the end of the list is included, and ``line`` is
            not modified.
        """
        output = []
        buff = ''
        for item in line:
            if item.isdigit():
                buff += item
            elif buff:
                output.append(int(buff))
                buff = ''
        if buff:
            # Flush a number that ends the line; the original dropped it.
            output.append(int(buff))
        return output

    def _to_int(self, string):
        """Convert a number written with ``,`` or ``.`` separators to int."""
        return int(string.replace(',', '').replace('.', ''))

    def create_electorates_table(self):
        """Drop any existing ``electorates`` table and create an empty one."""
        self.db.execute('DROP TABLE IF EXISTS electorates;')
        self.db.execute(
            """
            CREATE TABLE electorates (
                id INT NOT NULL AUTO_INCREMENT,
                election_id INT,
                state_code VARCHAR(3),
                electorate_name VARCHAR(30),
                enrollments INT,
                ballots INT,
                PRIMARY KEY(id)
            );
            """
        )

    def parse_electorate_file(self, fileinfo):
        """Insert one ``electorates`` row per electorate chunk in a file.

        Args:
            fileinfo: Dict with keys ``'fname'``, ``'year'``, ``'chamber'``
                and ``'state'``.

        A file with no ``===`` separator line is logged and skipped. Chunks
        that are empty, or whose first line has no 3+-space gap, are
        skipped. Missing counts are written as SQL NULL.
        """
        fname = fileinfo['fname']
        state_code = fileinfo['state'].upper()
        election_id = utils.get_election_id(self.db, fileinfo['year'], fileinfo['chamber'])
        with open(fname, 'r') as f:
            lines = [line.strip() for line in f]

        breaks = [i for i, x in enumerate(lines) if '===' in x]
        if not breaks:
            logging.warning('No section separators found in ' + fname)
            return
        # Each chunk starts on the line before a '===' line (its heading).
        electorate_chunks = [lines[start - 1:end - 1] for start, end in zip(breaks, breaks[1:])]
        electorate_chunks.append(lines[breaks[-1] - 1:])

        for chunk in electorate_chunks[2:]:
            if not chunk:
                continue
            headline = re.split(r'\s\s\s+', chunk[0])
            if len(headline) == 1:
                continue
            electorate_name = headline[0].split(',')[0].title()
            electorate_counts = self._parse_counts_line(re.split(r'[\s,]', headline[1]))
            electorate_enrolled = electorate_counts[0] if electorate_counts else "NULL"
            electorate_ballots = electorate_counts[1] if len(electorate_counts) > 1 else "NULL"

            # %s rather than %d: election_id and the counts may be the
            # string "NULL", which %d cannot format. Integers render the
            # same either way.
            sql = """
                INSERT INTO electorates (election_id, state_code, electorate_name,
                                         enrollments, ballots)
                VALUES (%s, '%s', "%s", %s, %s)
            """ % (election_id, state_code, electorate_name, electorate_enrolled, electorate_ballots)
            self.db.execute(sql)
class PartiesTableBuilder(object):
    """Build the ``parties`` table from the party key in each results file.

    The party key is the first ``===``-delimited chunk of a results file.
    Within it, every line starting with ``*`` gives a party code (second
    token) and a party name (fourth token onward).
    """

    def __init__(self, host, user, database):
        """Open the database connection used by all other methods."""
        self.db = SQLConnection(host=host, user=user, db=database)

    def build(self):
        """Recreate ``parties`` and load it from all house and senate files."""
        self.create_parties_table()
        states = utils.get_states()
        election_years = utils.get_election_years(base_dir=BASE_DIR)
        house_files = utils.get_election_files(BASE_DIR, election_years, "house", states)
        senate_files = utils.get_election_files(BASE_DIR, election_years, "senate", states)
        party_dictionary = {}
        for fdata in house_files:
            self.parse_party_data(fdata, party_dictionary)
        for sdata in senate_files:
            self.parse_party_data(sdata, party_dictionary)
        self.insert_party_data(party_dictionary)

    def __del__(self):
        # ``self.db`` does not exist if ``__init__`` failed before assigning
        # it, so look it up defensively instead of raising during cleanup.
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

    def _get_election_id(self, year, chamber):
        """Return the id of the election in ``year`` for ``chamber``.

        Raises:
            IndexError: If the query returns no rows.
        """
        sql = "SELECT id FROM elections WHERE YEAR(election_date) = %s AND chamber = '%s'" % (
            year,
            chamber,
        )
        id_raw = self.db.fetch(sql)
        return int(id_raw[0][0])

    def create_parties_table(self):
        """Drop any existing ``parties`` table and create an empty one."""
        self.db.execute("DROP TABLE IF EXISTS parties;")
        self.db.execute(
            """
            CREATE TABLE parties (
                party_code VARCHAR(5),
                party_name VARCHAR(50),
                party_name_alt VARCHAR(50),
                PRIMARY KEY(party_code)
            );
            """
        )

    def parse_party_data(self, fileinfo, party_dictionary):
        """Add the party codes and names found in one results file.

        Args:
            fileinfo: Dict with key ``'fname'``.
            party_dictionary: Updated in place; maps upper-cased party code
                to the list of distinct names seen for it, in first-seen
                order.

        Files with fewer than two ``===`` lines are ignored, as are ``*``
        lines with no party-code token.
        """
        fname = fileinfo["fname"]
        with open(fname, "r") as f:
            lines = [line.strip() for line in f]

        breaks = [i for i, x in enumerate(lines) if "===" in x]
        if len(breaks) < 2:
            return

        party_chunk = lines[breaks[0] - 1 : breaks[1] - 1]
        for entry in party_chunk:
            if not entry.startswith("*"):
                continue
            split_entry = re.split(r"\s+", entry)
            if len(split_entry) < 2:
                # A bare '*' has no code token; the original raised IndexError.
                continue
            party_code = split_entry[1].upper()
            party_name = " ".join(split_entry[3:])
            names = party_dictionary.setdefault(party_code, [])
            if party_name not in names:
                names.append(party_name)

    @staticmethod
    def _sql_literal(value):
        """Return ``value`` as a single-quoted SQL string, or ``NULL`` for None.

        Embedded single quotes are doubled, the standard SQL escape.
        """
        if value is None:
            return "NULL"
        return "'%s'" % value.replace("'", "''")

    def insert_party_data(self, party_dictionary):
        """Insert one ``parties`` row per party code.

        Args:
            party_dictionary: Mapping of party code to a non-empty list of
                names. The first name becomes ``party_name``; the second,
                if present, becomes ``party_name_alt``, otherwise SQL NULL.
        """
        for party in party_dictionary:
            names = party_dictionary[party]
            party_name = names[0]
            party_name_alt = names[1] if len(names) > 1 else None
            if "Emergency Committee" in party:
                party = "Emergency Committee"
            sql = """
                INSERT INTO parties (party_name, party_code, party_name_alt)
                VALUES (%s, %s, %s)
            """ % (
                self._sql_literal(party_name),
                self._sql_literal(party),
                self._sql_literal(party_name_alt),
            )
            self.db.execute(sql)