import csv
import json
import re
import urllib.request

import probablepeople as pp
import requests
from bs4 import BeautifulSoup

# Shared helpers (get_results_url, get_precincts_url, get_txtfile,
# initialize_race_obj, parse_name, get_candidate_info, get_vote_count,
# get_candidates_in_race_obj) are assumed to be defined elsewhere in the repo.

def scrape_dupage():

    COUNTY_NAME = "DuPage"
    # sets URLs
    DUPAGE_RACE_URL = get_results_url()
    PRECINCTS_URL = get_precincts_url()

    # gets data
    data = requests.get(DUPAGE_RACE_URL).json()
    precincts_data = requests.get(PRECINCTS_URL).json()

    # gets precinct info
    precincts_reporting = precincts_data['settings'][
        'numberofprecinctsreporting']
    precincts_total = precincts_data['settings']['totalprecinctsreporting']

    # creates empty list for results info
    dupage_county_results = []

    for datum in data:

        if datum['CAT'] == "Propositions":
            options = datum['CH']
            votes = datum['V']

            race_obj = initialize_race_obj(datum['C'], precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for option_index, (option, vote) in enumerate(zip(options, votes)):
                if option == "Yes/Sí":  # specific to DuPage
                    option = "Yes"

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name":
                    "",
                    "middle_name":
                    "",
                    "last_name":
                    option.title(),
                    "vote_count":
                    int(vote),
                    "ballot_order":
                    int(option_index + 1)
                })

            dupage_county_results.append(race_obj)

        elif datum['CAT'] == "County" or datum['CAT'] == "Judicial":
            candidates = datum['CH']
            cand_votes = datum['V']
            cand_parties = datum['P']

            race_obj = initialize_race_obj(datum['C'], precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for cand_index, (candidate, cand_vote, cand_party) in enumerate(
                    zip(candidates, cand_votes, cand_parties)):
                if candidate == "Yes/Sí":  # specific to DuPage
                    candidate = "Yes"

                # uses probablepeople to parse the name into a list of labeled
                # tokens (see the helper sketches after this function)
                full_name = pp.parse(candidate, 'person')
                first_name, middle_name, last_name = parse_name(full_name)

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name":
                    first_name,
                    "middle_name":
                    middle_name,
                    "last_name":
                    last_name,
                    "vote_count":
                    int(cand_vote),
                    "party":
                    cand_party,
                    "ballot_order":
                    int(cand_index + 1)
                })

            dupage_county_results.append(race_obj)

    with open('scrapers/dupage_data.json', 'w', encoding='utf-8') as f:
        json.dump(dupage_county_results, f, ensure_ascii=False, indent=4)

    return dupage_county_results


# leave this call commented out when running the app;
# uncomment it if you're just testing the scraper
# scrape_dupage()
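

# The scrapers in this file all call initialize_race_obj() and parse_name(),
# which are defined elsewhere in the repo. The sketches below are illustrative
# assumptions about those helpers (hence the _sketch suffixes), not the repo's
# actual implementations.


def initialize_race_obj_sketch(race_name, precincts_reporting, precincts_total,
                               county_name):
    # Assumed shape: a race dict with one county-level reporting unit whose
    # 'candidates' list each scraper appends to. The top-level 'name' key is
    # confirmed by scrape_cook(), which matches on item['name']; the precinct
    # key names are guesses.
    return {
        "name": race_name,
        "reporting_units": [{
            "name": county_name,
            "level": "county",
            "precincts_reporting": int(precincts_reporting),
            "precincts_total": int(precincts_total),
            "candidates": []
        }]
    }


def parse_name_sketch(full_name):
    # probablepeople's parse() returns (token, label) tuples, e.g.
    # [('Jane', 'GivenName'), ('Q.', 'MiddleInitial'), ('Doe', 'Surname')];
    # this folds them into first/middle/last strings.
    first, middle, last = [], [], []
    for token, label in full_name:
        if label == 'GivenName':
            first.append(token)
        elif label in ('MiddleName', 'MiddleInitial'):
            middle.append(token)
        elif label == 'Surname':
            last.append(token)
    return ' '.join(first), ' '.join(middle), ' '.join(last)
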
def scrape_kendall():
    COUNTY_NAME = "Kendall"
    # sets URLs
    KENDALL_RACE_URL = 'https://results.co.kendall.il.us/'

    # gets data
    html = urllib.request.urlopen(KENDALL_RACE_URL).read()
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup)

    # creates empty list for results info
    kendall_county_results = []

    data = soup.find('pre').text
    precincts_total = 87  # hard-coded total for Kendall
    rows = data.splitlines()
    # print(rows)

    for index, row in enumerate(rows):
        if row.startswith(" PRECINCTS"):
            precincts_reporting = int(row[-2:])  # reporting count sits in the last two characters

        if row == "COUNTY BOARD MEMBER-DIST.1":
            dist1_race_name = row

            dist1_race_obj = initialize_race_obj(dist1_race_name,
                                                 precincts_reporting,
                                                 precincts_total, COUNTY_NAME)

        if 115 <= index <= 119:  # hard-coded row range; double-check it every once in a while
            cand_index = index - 112  # same as int(str(index)[-1:]) - 2 for rows 115-119
            cand_info, full_name, party = get_candidate_info(row)

            first_name, middle_name, last_name = parse_name(full_name)

            # votes = 99
            votes = get_vote_count(cand_info)

            formatted_candidate_info = get_candidates_in_race_obj(
                first_name, middle_name, last_name, votes, party, cand_index)

            dist1_race_obj["reporting_units"][0]['candidates'].append(
                formatted_candidate_info)

        if row == "COUNTY BOARD MEMBER-DIST.2":
            dist2_race_name = row

            dist2_race_obj = initialize_race_obj(dist2_race_name,
                                                 precincts_reporting,
                                                 precincts_total, COUNTY_NAME)

        if 124 <= index <= 129:  # hard-coded row range; double-check it every once in a while
            cand_index = index - 121  # same as int(str(index)[-1:]) - 1 for rows 124-129
            cand_info, full_name, party = get_candidate_info(row)

            first_name, middle_name, last_name = parse_name(full_name)
            votes = get_vote_count(cand_info)

            formatted_candidate_info = get_candidates_in_race_obj(
                first_name, middle_name, last_name, votes, party, cand_index)

            dist2_race_obj["reporting_units"][0]['candidates'].append(
                formatted_candidate_info)

    kendall_county_results.append(dist1_race_obj)
    kendall_county_results.append(dist2_race_obj)

    with open('scrapers/kendall_data.json', 'w', encoding='utf-8') as f:
        json.dump(kendall_county_results, f, ensure_ascii=False, indent=4)

    return kendall_county_results


# leave this call commented out when running the app;
# uncomment it if you're just testing the scraper
# scrape_kendall()
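

# The hard-coded index windows in scrape_kendall() (rows 115-119 and 124-129)
# break whenever the county adds or removes lines from the feed. A sketch of a
# positional alternative: collect the rows between a race header and the next
# blank line instead. The assumption that a blank line closes each race block
# is mine, not the feed's documented format.


def collect_race_rows(rows, race_header):
    """Return the rows that follow race_header, up to the next blank line."""
    race_rows = []
    in_race = False
    for row in rows:
        if row == race_header:
            in_race = True
            continue
        if in_race:
            if not row.strip():  # assumed: a blank line ends the race block
                break
            race_rows.append(row)
    return race_rows


# Hypothetical usage inside scrape_kendall():
#     for cand_index, row in enumerate(
#             collect_race_rows(rows, "COUNTY BOARD MEMBER-DIST.1")):
#         cand_info, full_name, party = get_candidate_info(row)
#         ...
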
def scrape_mchenry():

    COUNTY_NAME = "McHenry"
    # URLs
    MCHENRY_RACE_URL = get_results_url()
    PRECINCTS_URL = get_precincts_url()

    data = requests.get(MCHENRY_RACE_URL).json()
    precincts_data = requests.get(PRECINCTS_URL).json()

    # gets precinct info
    precincts_reporting = precincts_data['settings'][
        'numberofprecinctsreporting']
    precincts_total = precincts_data['settings']['totalprecinctsreporting']

    mchenry_county_results = []

    for datum in data:

        if datum['CAT'] == 'County' and datum['SUBCAT'] == 'Questions':
            options = datum['CH']
            votes = datum['V']
            race_name = datum['C']

            race_obj = initialize_race_obj(race_name, precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for option_index, (option, vote) in enumerate(zip(options, votes)):

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name":
                    "",
                    "middle_name":
                    "",
                    "last_name":
                    option.title(),
                    "vote_count":
                    int(vote),
                    "ballot_order":
                    int(option_index + 1)
                })

            mchenry_county_results.append(race_obj)

        elif datum['CAT'] == "County" and datum['SUBCAT'] != "Questions":
            candidates = datum['CH']
            cand_votes = datum['V']
            cand_parties = datum['P']

            race_name = datum['C']

            race_obj = initialize_race_obj(race_name, precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for cand_index, (candidate, cand_vote, cand_party) in enumerate(
                    zip(candidates, cand_votes, cand_parties)):
                # uses probablepeople to parse the name into a list of labeled tokens
                full_name = pp.parse(candidate, 'person')
                first_name, middle_name, last_name = parse_name(full_name)

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name":
                    first_name,
                    "middle_name":
                    middle_name,
                    "last_name":
                    last_name,
                    "vote_count":
                    int(cand_vote),
                    "party":
                    cand_party,
                    "ballot_order":
                    int(cand_index + 1)
                })

            mchenry_county_results.append(race_obj)

    with open('scrapers/mchenry_data.json', 'w', encoding='utf-8') as f:
        json.dump(mchenry_county_results, f, ensure_ascii=False, indent=4)

    return mchenry_county_results


# leave this call commented out when running the app;
# uncomment it if you're just testing the scraper
# scrape_mchenry()
def scrape_lake():

    COUNTY_NAME = "Lake County"
    # sets URLs
    LAKE_RACE_URL = get_results_url()
    PRECINCTS_URL = get_precincts_url()

    # gets data
    data = requests.get(LAKE_RACE_URL).json()
    precincts_data = requests.get(PRECINCTS_URL).json()

    # gets precinct info
    precincts_reporting = precincts_data['settings'][
        'numberofprecinctsreporting']
    precincts_total = precincts_data['settings']['totalprecinctsreporting']

    # creates empty list for results info
    lake_county_results = []

    for datum in data:
        race_name = datum['C']
        candidates = datum['CH']
        cand_votes = datum['V']
        cand_parties = datum['P']

        race_obj = initialize_race_obj(race_name, precincts_reporting,
                                       precincts_total, COUNTY_NAME)

        for cand_index, (candidate, cand_vote, cand_party) in enumerate(
                zip(candidates, cand_votes, cand_parties)):
            # uses probablepeople to parse the name into a list of labeled tokens
            full_name = pp.parse(candidate, 'person')
            first_name, middle_name, last_name = parse_name(full_name)

            # appends to candidates list
            race_obj["reporting_units"][0]['candidates'].append({
                "first_name":
                first_name,
                "middle_name":
                middle_name,
                "last_name":
                last_name,
                "vote_count":
                int(cand_vote),
                "ballot_order":
                int(cand_index + 1)
            })
            # print(race_obj)

        lake_county_results.append(race_obj)

        # print(lake_county_results)

    with open('scrapers/lake_data.json', 'w', encoding='utf-8') as f:
        json.dump(lake_county_results, f, ensure_ascii=False, indent=4)

    return lake_county_results


# leave this call commented out when running the app;
# uncomment it if you're just testing the scraper
# scrape_lake()
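

# scrape_dupage(), scrape_mchenry() and scrape_lake() all read the same JSON
# shape ('C' race name, 'CH' choices, 'V' votes, 'P' parties) and append
# candidates almost identically. A sketch of a shared helper that could absorb
# that duplication (hypothetical; note that scrape_lake() currently omits the
# 'party' key, so adopting this would change its output slightly):


def append_candidates(race_obj, datum):
    for cand_index, (candidate, vote, party) in enumerate(
            zip(datum['CH'], datum['V'], datum['P'])):
        full_name = pp.parse(candidate, 'person')
        first_name, middle_name, last_name = parse_name(full_name)
        race_obj["reporting_units"][0]['candidates'].append({
            "first_name": first_name,
            "middle_name": middle_name,
            "last_name": last_name,
            "vote_count": int(vote),
            "party": party,
            "ballot_order": int(cand_index + 1)
        })
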
def scrape_cook():

    # This scraper loops through the results text data (SummaryExport.txt)
    # and keeps only the records that match an ID in cook-IDs.csv. A race_obj
    # is created just once per race name, tracked via the `added` list; an
    # inner loop then finds that race in `cook_county_results` and appends
    # the current record's candidate info to it.

    get_txtfile()

    COUNTY_NAME = "Cook County"
    cook_county_results = []
    added = []

    with open('scrapers/cook-IDs.csv', newline='') as f:
        reader = csv.reader(f)
        cook_info = list(reader)
    # updated_cook.txt is the file newly written by get_txtfile()
    with open('scrapers/updated_cook.txt', 'r') as r:
        results_data = r.readlines()

    # Matches each results record to its cook-IDs.csv entry by the first
    # seven characters (the race/candidate ID).
    for results_row in results_data:
        current_ID_match = results_row[0:7]  # ID from the results record
        for info_line in cook_info:
            full_ID_match = info_line[0][0:7]  # ID from the context CSV

            if current_ID_match == full_ID_match:

                full_ID = info_line[0]
                race_name = info_line[1].title()
                candidate = info_line[2]
                # uses probablepeople to parse the name into a list of labeled tokens
                full_name = pp.parse(candidate, 'person')

                first_name, middle_name, last_name = parse_name(full_name)

                precincts_total = int(results_row[7:11])
                vote_count = int(results_row[11:18])
                precincts_reporting = int(results_row[18:22])
                cand_party = full_ID[22:25]  # parsed but not currently written to the output
                ballot_order = int(info_line[0][4:7])

                if race_name not in added:
                    # creates object in format of race object for use in TribPub's Google Sheet
                    race_obj = initialize_race_obj(race_name,
                                                   precincts_reporting,
                                                   precincts_total,
                                                   COUNTY_NAME)
                    cook_county_results.append(race_obj)
                    added.append(race_name)

                for item in cook_county_results:
                    if item['name'] == race_name:  # race_name is already title-cased above
                        item['reporting_units'][0]['candidates'].append({
                            "first_name": first_name,
                            "middle_name": middle_name,
                            "last_name": last_name,
                            "vote_count": int(vote_count),
                            "ballot_order": int(ballot_order)
                        })

    # print(cook_county_results)

    with open('scrapers/cook_data.json', 'w', encoding='utf-8') as f:
        json.dump(cook_county_results, f, ensure_ascii=False, indent=4)

    return cook_county_results


# leave this call commented out when running the app;
# uncomment it if you're just testing the scraper
# scrape_cook()
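

# The Cook results feed is fixed-width, and scrape_cook() slices each record
# by position. A worked example of those offsets on a made-up record (the
# layout matches the slices used above; every value here is invented):

_SAMPLE_COOK_ROW = "0010001005000123450050"  # invented example record
assert _SAMPLE_COOK_ROW[0:7] == "0010001"     # race/candidate ID (matched against the CSV)
assert int(_SAMPLE_COOK_ROW[7:11]) == 50      # precincts_total
assert int(_SAMPLE_COOK_ROW[11:18]) == 12345  # vote_count
assert int(_SAMPLE_COOK_ROW[18:22]) == 50     # precincts_reporting
# Party ([22:25]) and ballot order ([4:7]) come from the cook-IDs.csv side:
# full_ID[22:25] and info_line[0][4:7] (here "001" gives ballot order 1).
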
def scrape_kane():

    COUNTY_NAME = "Kane County"
    # the other scrapers treat get_results_url() as returning a URL, so the
    # same is assumed here: fetch the page, then parse it
    KANE_RACE_URL = get_results_url()
    html = requests.get(KANE_RACE_URL).text
    kane_data = BeautifulSoup(html, 'html.parser')

    # creates empty list for results info
    kane_county_results = []

    # each <h2> holds a race name; the findPrevious/findNext/findChildren
    # calls below navigate the page relative to these headings
    race_data = kane_data.findAll("h2")
    for race in race_data:
        candidates = []
        votes = []

        # finds precincts reporting and total precincts
        finding_precincts_info = race.findPrevious('td')
        precincts_info = finding_precincts_info.findPrevious('td')
        # pulls the integers out of the precincts line
        precincts = list(map(int, re.findall(r'\d+', str(precincts_info))))
        precincts_reporting = precincts[0]
        precincts_total = precincts[1]
        # print(precincts_reporting, precincts_total)

        cands = race.findNext('table')
        names = cands.findChildren('td')
        for name in names:
            name = str(name)
            if name.startswith('<td>'):
                # splits pinpoint just the candidate name, which is then
                # appended to the candidates list; keeping `name` itself a
                # string lets the vote check below still match this row
                if '(Write-In)' in name:
                    before_votes = name.split('<b>', 2)[0]
                    cand_name = before_votes.split('>', 2)[1]
                    candidates.append(cand_name)
                elif '(Independent)' in name or '(Democratic)' in name or '(Republican)' in name:
                    candidate = name.rsplit('(', 1)[0]
                    cand_name = candidate.split('>', 1)[1]
                    candidates.append(cand_name)
                else:
                    cand_name = name.split('>', 2)[1].split('</', 2)[0]
                    candidates.append(cand_name)
            if '<b>' in name:
                # vote counts sit in <b> tags; the vote percentages do too,
                # so anything containing '%' is skipped
                bold_text = name.split('</b>', 2)[0].split('<b>', 2)[1]
                if '%' not in bold_text:
                    votes.append(bold_text)

        race = str(race)
        race_name = race.split('<br/>', 2)[0].split('>', 2)[1]

        # creates object in format of race object for use in TribPub's Google Sheet
        race_obj = initialize_race_obj(race_name, precincts_reporting,
                                       precincts_total, COUNTY_NAME)

        for option_index, (candidate, vote) in enumerate(zip(candidates, votes)):
            # uses probablepeople to parse the name into a list of labeled tokens
            full_name = pp.parse(candidate, 'person')
            first_name, middle_name, last_name = parse_name(full_name)

            race_obj["reporting_units"][0]['candidates'].append({
                "first_name": first_name,
                "middle_name": middle_name,
                "last_name": last_name,
                "vote_count": int(vote),
                "ballot_order": int(option_index + 1)
            })

        kane_county_results.append(race_obj)

    with open('scrapers/kane_data.json', 'w', encoding='utf-8') as f:
        json.dump(kane_county_results, f, ensure_ascii=False, indent=4)

    return kane_county_results


# leave this call commented out when running the app;
# uncomment it if you're just testing the scraper
# (or use the __main__ guard sketched below)
# scrape_kane()
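

# Every scraper above is invoked by commenting its call in and out by hand. A
# standard alternative is a __main__ guard: the calls run when this module is
# executed directly but stay inert when the app imports it. A minimal sketch:

if __name__ == '__main__':
    # run whichever scrapers you are currently testing
    scrape_dupage()
    scrape_kendall()
    scrape_mchenry()
    scrape_lake()
    scrape_cook()
    scrape_kane()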