def fetch_data():
    """Walk the county dropdown, then each county's clerk links, and parse
    every jurisdiction page into a record."""
    records = []
    soup = BeautifulSoup(cache_request(BASE_URL, verify=SSL_CERT), 'html.parser')
    for county in tqdm(soup.find('select', id='Counties')('option')):
        if not county.get('value'):  # skip the placeholder option
            continue
        county_page = cache_request(f'{BASE_URL}/SearchByCounty', method='POST',
                                    data={'CountyID': county.get('value')},
                                    wait=random_wait(), verify=SSL_CERT)
        county_soup = BeautifulSoup(county_page, 'html.parser')
        for clerk_link in county_soup('a', class_='local-clerk-link'):
            query = parse_qs(urlparse(clerk_link.get('href')).query)
            form_data = {key: vals[0] for key, vals in query.items() if key != 'dummy'}
            clerk_page = cache_request(f'{BASE_URL}/LocalClerk', method='POST',
                                       data=form_data, wait=random_wait(),
                                       verify=SSL_CERT)
            clerk_soup = BeautifulSoup(clerk_page, 'html.parser')
            records.append(parse_jurisdiction(
                clerk_soup,
                form_data['jurisdictionName'],
                county.text,
                fipscode=form_data['jurisdictionCode']))
    return records
def fetch_data():
    """Follow every countyInfo link from the index page and parse each county."""
    index_soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    results = []
    for link in tqdm(index_soup.select('a[href^=countyInfo]')):
        county_html = cache_request(BASE_URL + link['href'])
        results.append(parse_county(BeautifulSoup(county_html, 'html.parser')))
    return results
def fetch_data():
    """POST each town id from the list page to the detail page and parse it."""
    list_soup = BeautifulSoup(cache_request(LIST_URL), 'html.parser')
    results = []
    for town_option in tqdm(list_soup.find(id='idTown')('option')):
        detail_html = cache_request(
            DETAIL_URL,
            method='POST',
            data={'idTown': town_option['value'], 'contactType': 'R'},
            wait=random.uniform(1, 3))
        results.append(parse_county(BeautifulSoup(detail_html, 'html.parser')))
    return results
def fetch_data():
    """Follow each image-map <area> link on the county boards page."""
    soup = BeautifulSoup(
        cache_request('https://www.elections.ny.gov/CountyBoards.html'),
        'html.parser')
    results = []
    for area in soup.find_all('area'):
        county_html = cache_request(area['href'])
        results.append(parse_county(BeautifulSoup(county_html, 'html.parser')))
    return results
def fetch_data(verbose=True):
    """Follow each image-map <area> link and parse the linked county page.

    Args:
      verbose: show a tqdm progress bar when True.
    """
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    results = []
    for area in tqdm(soup.find_all('area'), disable=not verbose):
        county_html = cache_request(area['href'])
        results.append(parse_county(BeautifulSoup(county_html, 'html.parser')))
    return results
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Replay the .aspx form (including __VIEWSTATE) to request the CSV export."""
    # The .aspx page requires all hidden form inputs to be echoed back.
    landing_page = cache_request(BASE_URL)
    form = BeautifulSoup(landing_page, 'lxml').find('form')
    form_data = {}
    for form_input in form('input'):
        form_data[form_input['name']] = form_input['value']
    # Ask the report endpoint for CSV output instead of the HTML report.
    form_data['ctl00$MainContentPlaceHolder$PPReport'] = 'rdoCsv'
    csv_text = cache_request(BASE_URL, method='POST', data=form_data)
    return parse_csv(csv_text)
def parse_pdf():
    """Download the 'WI Municipal Clerks' PDF and parse one record per city chunk."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    pdf_url = soup.find('a', text=re.compile('^WI Municipal Clerks'))['href']
    pdf_bytes = cache_request(pdf_url, is_binary=True)
    records = []
    with BytesIO(pdf_bytes) as stream:
        reader = PyPDF2.PdfFileReader(stream)
        for page_idx in tqdm(range(reader.numPages)):
            page_text = reader.getPage(page_idx).extractText()
            records.extend(parse_city(chunk)
                           for chunk in re_city_chunk.findall(page_text))
    return records
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Scrape the Kansas SoS table of county election officers."""

    def _clean_number(raw):
        # Same normalization the site's phone/fax cells need: strip newlines,
        # drop '(' and turn ')' into '-'.
        return raw.replace('\n', '').replace('(', '').replace(')', '-')

    text = cache_request('https://www.sos.ks.gov/elections/county_election_officers_all.aspx')
    soup = BeautifulSoup(text, 'html.parser')
    # Remove first labels row.
    counties = []
    for row in soup.body.find_all('tr')[1:]:
        cells = row.find_all('font')
        name = cells[0].text + ' County'
        # Index 3 is hours of operation, which we don't care about.
        # Only some counties have a second address line.
        line2 = cells[7].text
        if len(line2) > 1:
            line2 = ' ' + line2 + ' '
        full_addr = (cells[6].text + line2 + cells[8].text + ' '
                     + cells[9].text + cells[10].text)
        counties.append({
            'county': name,
            'locale': name,
            'officer': cells[1].text,
            'emails': [cells[2].text],
            'phones': [_clean_number(cells[4].text)],
            'faxes': [_clean_number(cells[5].text)],
            # Trailing character dropped as in the upstream format.
            'address': full_addr.replace('\n', ' ').replace('\xa0', ' ')[:-1],
        })
    return counties
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Collect emails from mailto links and phones/faxes/address from <p> text."""
    data = defaultdict(list)
    soup = BeautifulSoup(cache_request(URL), 'html.parser')
    for anchor in soup.find_all('a', href=re.compile(r'^mailto:')):
        # Prefer the visible text when it is itself an address.
        if '@' in anchor.text:
            data['emails'].append(anchor.text)
        else:
            data['emails'].append(anchor['href'].replace('mailto:', ''))
    for paragraph in soup.find_all('p'):
        para_text = paragraph.text
        if 'Fax' in para_text:
            data['faxes'].append(extract_phone_number(para_text))
        elif 'Phone' in para_text or 'Toll-Free' in para_text:
            data['phones'].append(extract_phone_number(para_text))
        elif 'Absentee and Petition Office' in para_text:
            addr_lines = para_text.split('\n')
            data['address'] = (addr_lines[1] + addr_lines[2]).replace('\r', ' ')
    data['locale'] = 'All'
    return [dict(data)]
def query_clerk_data(pdf_data, verbose=True):
    """Look up clerk contact info for each PDF record via the address API.

    Args:
      pdf_data: iterable of dicts with optional 'municipal_address' /
        'mailing_address' fields.
      verbose: show a tqdm progress bar when True.

    Returns:
      List of clerk dicts (one per address that resolved to a clerk), with
      fax/phone1/phone2 normalized via ``re_to_e164`` or set to None.
    """
    clerk_data = []
    for pdf_datum in tqdm(pdf_data, disable=not verbose):
        for field in ('municipal_address', 'mailing_address'):
            addr = pdf_datum.get(field)
            if not addr:
                continue
            match = re_addr.search(addr)
            if match is None:
                # Fix: previously an unparseable address raised AttributeError
                # on .groups(); skip it instead of crashing the whole run.
                continue
            street, city, _, zipcode = match.groups()
            post_data = {
                'addressLine': street,
                'unit': "",
                'city': city,
                'zip': zipcode
            }
            result = cache_request(POST_URL, method='POST', data=post_data,
                                   wait=random.uniform(.1, .3))
            json_data = json.loads(result).get('Data') or {}
            clerk = json_data.get('clerk')
            if not clerk:
                continue
            # Remove invalid phones/faxes.
            for phfax_field in ('fax', 'phone1', 'phone2'):
                phfax_match = re_to_e164.findall(clerk.get(phfax_field, ''))
                clerk[phfax_field] = ''.join(phfax_match[0]) if phfax_match else None
            clerk_data.append(clerk)
    return clerk_data
def main():
    """Extract Wyoming county clerk data from the SoS PDF and write JSON."""
    req = cache_request(
        "https://sos.wyo.gov/Elections/Docs/WYCountyClerks_AbsRequest_VRChange.pdf",
        is_binary=True)
    with BytesIO(req) as fh:
        pdf_reader = PyPDF2.PdfFileReader(fh)
        text = ''
        for page_num in tqdm(range(pdf_reader.numPages)):
            text += pdf_reader.getPage(page_num).extractText()
    # Remove "Page # of #" footers. Fix: use \d+ so documents with 10 or more
    # pages (two-digit page numbers) are also stripped.
    text = re.sub(r"\sPage\s\n\d+\sof\s\n\d+\s", "", text)
    lines = make_lines(text)
    # Remove first couple of lines, which are the PDF header.
    lines = lines[9:]
    rows = group_rows(lines)
    grouped_rows = group_counties(rows)
    counties = generate_county_dict_list(grouped_rows)
    with open('public/wyoming.json', 'w') as f:
        json.dump(counties, f)
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Parse every row of the county auditors table."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    row_selector = 'table#ctl00_ContentPlaceHolder1_rgCountyAuditors_ctl00 tbody tr'
    rows = soup.select(row_selector)
    return [parse_row(row) for row in rows]
def get_locality_datum(id_):
    """Fetch and parse the public-contact record for one Virginia locality."""
    page = cache_request(
        'https://vote.elections.virginia.gov/VoterInformation/PublicContactLookup',
        method='POST',
        data={'LocalityUid': id_},
        wait=2,
    )
    soup = BeautifulSoup(page, 'lxml')
    wrapper = soup.select('.resultsWrapper')[0]
    labels = wrapper.select('h5.display-lable')
    fields = wrapper.select('p.display-field')
    results = {label.text.strip(): field.text.strip()
               for label, field in zip(labels, fields)}
    locale = soup.select('select > option[selected="selected"]')[0].text.title()
    is_county = locale.endswith('County')
    datum = {
        'locale': locale,
        'county': locale if is_county else None,
        'city': None if is_county else locale,
        'official': results['Registrar'],
        'emails': [results['Email']],
        'faxes': [results['Fax']],
        'url': results.get('URL'),
        'address': results.get('Mailing Address') or results.get('Address'),
        'physicalAddress': results.get('Physical Address'),
    }
    # Drop empty/missing fields.
    return {key: val for key, val in datum.items() if val}
def get_locality_ids():
    """Return the non-empty option values from the locality dropdown."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    ids = []
    for option in soup.select('select>option'):
        if option['value']:
            ids.append(option['value'])
    return ids
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Parse each county div from the listing page."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    county_divs = soup.select('div.field-items>div.field-item div.col-sm-6')
    parsed = [parse_county(div) for div in county_divs]
    # Sanity check: expect exactly 93 entries from this page.
    assert len(parsed) == 93
    return parsed
def fetch_emails():
    """Download the county e-mail spreadsheet and group emails by locale."""
    listing_html = cache_selenium(EMAIL_LINK_URL)
    listing_soup = BeautifulSoup(listing_html, 'html.parser')
    xlsx_url = listing_soup('a', text=re.compile(r'county.*e-?mail',
                                                 re.IGNORECASE))[0]['href']
    xlsx_bytes = cache_request(xlsx_url, is_binary=True)
    # Forward-fill merged cells, then strip stray whitespace from every column.
    table = pd.read_excel(xlsx_bytes).fillna(method='ffill')
    table = table.apply(lambda col: col.str.strip())
    table = table.rename(columns={'Email': 'emails'})
    table['locale'] = table['County'].str.title() + ' County'
    return table.groupby('locale')['emails'].apply(list)
def fetch_data():
    """Parse each county div from the listing page."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    county_divs = soup.select('div.field-items>div.field-item div.col-sm-6')
    parsed = [parse_county(div) for div in county_divs]
    # Sanity check: expect exactly 93 entries from this page.
    assert len(parsed) == 93
    return parsed
def fetch_data():
    """Parse each populated county_info_* div from the AZ SoS page."""
    soup = BeautifulSoup(cache_request('https://azsos.gov/county-election-info'),
                         'html.parser')
    results = []
    for county_div in soup('div', id=re.compile('^county_info_')):
        # There are extra blank divs; a real county div contains an <h2>.
        if county_div.find('h2'):
            results.append(parse_county(county_div))
    return results
def fetch_data():
    """Fetch the pipe-delimited feed and convert each non-empty row to a record."""
    text = cache_request(BASE_URL)
    # The server wraps the payload in a literal '<plaintext>' tag; strip it.
    prefix = '<plaintext>'
    if text.startswith(prefix):
        text = text[len(prefix):]
    rows = [row for row in csv.reader(StringIO(text), delimiter='|') if row]
    return [record(row) for row in rows]
def get_locality_ids():
    """Return the non-empty option values from the locality dropdown."""
    page = cache_request(
        'https://vote.elections.virginia.gov/VoterInformation/PublicContactLookup'
    )
    soup = BeautifulSoup(page, 'lxml')
    ids = []
    for option in soup.select('select>option'):
        if option['value']:
            ids.append(option['value'])
    return ids
def parse_pdf():
    """Locate the 'WI Municipal Clerks' PDF and parse one record per city chunk."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    pdf_url = soup.find('a', text=re.compile('^WI Municipal Clerks'))['href']
    pdf_text = fetch_pdf_text(pdf_url)
    records = []
    for city_chunk in tqdm(re_city_chunk.findall(pdf_text)):
        records.append(parse_city(city_chunk))
    return records
def parse_email_list(email_list_url):
    """Map county name -> contact email from the email-list table."""
    soup = BeautifulSoup(cache_request(email_list_url), 'html.parser')
    email_by_county = {}
    for row in soup.find('tbody')('tr'):
        county_name = row.find('td').text
        anchor = row.find('a')
        # Some rows have no email link; skip them.
        if anchor is not None:
            email_by_county[county_name] = anchor.text
    return email_by_county
def fetch_data(verbose=True):
    """Fetch every county listed in the page's dropdown.

    Args:
      verbose: show a tqdm progress bar when True.
    """
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    county_names = []
    for option in soup.select('select>option'):
        if option['value']:
            county_names.append(option['value'])
    return [fetch_county(name)
            for name in tqdm(county_names, disable=not verbose)]
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Pull emails, phones and faxes out of the dcboeContent block."""
    soup = BeautifulSoup(cache_request(URL), 'html.parser')
    content_text = soup.find(class_='dcboeContent').get_text('\n')
    fax_numbers = re_fax.findall(content_text)
    # Anything matching the phone regex that is also a fax is excluded.
    phone_numbers = [num for num in re_phone.findall(content_text)
                     if num not in fax_numbers]
    return [{
        'locale': 'All',
        'emails': re_email.findall(content_text),
        'phones': phone_numbers,
        'faxes': fax_numbers,
    }]
def get_pdf_text(url):
    """Download a PDF and return its full extracted text via pdfminer."""
    pdf_bytes = cache_request(url, is_binary=True)
    with BytesIO(pdf_bytes) as pdf_file:
        with StringIO() as output:
            manager = PDFResourceManager()
            converter = TextConverter(manager, output, laparams=LAParams())
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(pdf_file, set()):
                interpreter.process_page(page)
            converter.close()
            extracted = output.getvalue()
    return extracted
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Pair each county heading link with its accordion body and parse both."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    results = []
    for county_link in soup.select('h2.contentpage-h2 a'):
        # data-target holds '#<element id>' of the matching detail panel.
        panel_id = county_link['data-target'].split('#')[1]
        panel = soup.find(id=panel_id)
        results.append(parse_county(county_link, panel))
    return results
def fetch_data():
    """Merge the county board PDF data with the email list and return records."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    pdf_link = soup.find('a', text=re.compile('^List of County Election Boards'))
    email_link = soup.find('a', text=re.compile('^County Election Board Email Addresses'))
    pdf_data = parse_pdf(urljoin(BASE_URL, pdf_link['href']))
    email_data = parse_email_list(urljoin(BASE_URL, email_link['href']))
    # Both sources must cover the same set of counties.
    assert len(pdf_data) == len(email_data)
    for county_name, email in email_data.items():
        pdf_data[county_name]['emails'] = [email]
    return list(pdf_data.values())
def fetch_county(county_name):
    """Scrape one county's contact table into a record dict."""
    locale = f"{county_name} County"
    html = cache_request(f"{BASE_URL}?County={county_name}",
                         wait=random.uniform(.5, 1.5))
    soup = BeautifulSoup(html, 'lxml')
    table = soup.select_one('table#data')
    # Collapse runs of blank lines so the line-oriented regexes line up.
    table_text = re_dense_lines.sub('\n', table.get_text('\n'))
    urls = re_url.findall(table_text)
    return {
        'locale': locale,
        'county': locale,
        'official': re_official.findall(table_text)[0],
        'emails': re_email.findall(table_text),
        'faxes': re_fax.findall(table_text),
        'phones': re_phone.findall(table_text),
        'url': urls[0] if urls else None,
        'address': parse_addr(re_mailing_addr.findall(table_text)),
        'physicalAddress': parse_addr(re_physical_addr.findall(table_text)),
    }
def main():
    """Scrape Maryland county/city election board contacts and write JSON."""
    text = cache_request(
        'https://elections.maryland.gov/about/county_boards.html')
    soup = BeautifulSoup(text, 'lxml')
    # Each board is one <p> inside the main content wrapper.
    counties = soup.select('div.mdgov_contentWrapper > p')
    # lines = [line for line in line_gen(counties[1].children)]
    data = []
    for county in counties:
        lines = list(county.children)
        # find_hrefs pulls urls/emails out of the anchor tags in this block.
        href_datum = find_hrefs(lines)
        url_datum = {
            'url': href_datum['urls'][0]
        } if href_datum['urls'] else {}
        # First line of the block is the geography name ("X County" or a city).
        geo = lines[0].text.strip()
        if geo.endswith('City'):
            # NOTE(review): locale uses a 'city:' / ':county' colon convention —
            # presumably consumed downstream; confirm before changing.
            geo_datum = {
                'locale': geo + ':',
                'city': geo,
            }
        else:
            # for Baltimore
            county = geo if geo.endswith(' County') else geo + ' County'
            geo_datum = {
                'locale': ':' + county,
                'county': county,
            }
        datum = {
            **geo_datum,
            'official': find_re(election_director_re, lines),
            'phones': find_re(phone_re, lines, find_all=True),
            'faxes': [find_re(fax_re, lines)],
            **url_datum,
            **href_datum,
        }
        # Every board is expected to have at least one email and a fax;
        # fail loudly if the page layout changes.
        assert datum['emails']
        assert find_re(fax_re, lines)
        data += [datum]
    with open('public/maryland.json', 'w') as fh:
        json.dump(data, fh)
def get_locality_datum(id_, attempt=0):
    """Fetch one Virginia locality's contact record, retrying on bad responses.

    Args:
      id_: locality uid POSTed as 'LocalityUid'.
      attempt: internal retry counter; gives up after 5 retries.

    Returns:
      Dict of non-empty contact fields, or {} if retries are exhausted.
    """
    if attempt > 5:
        return {}
    page = cache_request(
        BASE_URL,
        method='POST',
        data={'LocalityUid': id_},
        wait=random.uniform(2, 3),
    )
    soup = BeautifulSoup(page, 'lxml')
    results_wrappers = soup.select('.resultsWrapper')
    if results_wrappers:
        # Labels and values appear as parallel h5/p lists inside the wrapper.
        keys = results_wrappers[0].select('h5.display-lable')
        vals = results_wrappers[0].select('p.display-field')
        results = {
            key.text.strip(): val.text.strip()
            for key, val in zip(keys, vals)
        }
        # The selected dropdown option names the locality.
        locale = soup.select(
            'select > option[selected="selected"]')[0].text.title()
        final = {
            'locale': locale,
            'county': locale if locale.endswith('County') else None,
            'city': locale if not locale.endswith('County') else None,
            'official': results['Registrar'],
            'emails': to_list(results['Email']),
            'faxes': to_list(results['Fax']),
            'url': results.get('URL'),
            'address': results.get('Mailing Address') or results.get('Address'),
            'physicalAddress': results.get('Physical Address'),
        }
        # Drop empty/missing fields.
        return {k: v for k, v in final.items() if v}
    # No results wrapper: the cached response is bad. Evict the most recently
    # written cache file so the retry refetches, then wait and recurse.
    cached_files = glob.glob(os.path.join(work_dir, '*.*'))
    latest_file = max(cached_files, key=os.path.getctime)
    logging.warning(
        "error in Virginia file; deleting cached file %s; retry after wait",
        latest_file)
    os.remove(latest_file)
    sleep(30)
    return get_locality_datum(id_, attempt + 1)