def scrape_meeting_notice(self, chamber, item, url):
    """Build an Event from one DE meeting-notice JSON record and attach
    its agenda items fetched from the meeting-items endpoint.

    Yields the populated Event.
    """
    # Since Event Name is not provided for all meetings, use the committee name.
    event_name = str(item['CommitteeName'])
    # MeetingDateTime looks like: 04/25/2012 03:00:00 PM
    # A four-digit year and a seconds field are present, so the format
    # must use %Y and %I:%M:%S — the previous "%m/%d/%y %I:%M %p" could
    # not parse this value at all.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'
                  .format(item['CommitteeMeetingStatusName'])
                  )
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId'])
                )
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # Distinct loop name so the meeting record `item` is not shadowed.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row['ItemDescription']))
        event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                         id=str(agenda_row['PrimarySponsorPersonId']),
                         note='Sponsor')

    yield event
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to. `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        if when and description and location:
            event = Event(name=description,
                          start_date=when.replace(tzinfo=self.tz),
                          description=description,
                          location_name=location)
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined in this method — unless it
            # is a module-level global this raises NameError. Confirm and
            # pass the real source URL for the scraped page here.
            event.add_source(url)
            yield event
def parse_div(self, row, chamber, com): cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0] # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip() title, location, start_date, end_date = self.parse_gcal(cal_link) event = Event( start_date=start_date, end_date=end_date, name=title, location_name=location, ) event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx') for item in row.xpath('.//div[@class="col-xs-12a Item"]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) event.add_document( description, item.xpath('@href')[0], media_type="application/pdf", on_duplicate="ignore" ) for item in row.xpath('.//div[contains(@class,"ItemContainer")]' '[./div[@class="col-xs-1 Item"]]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip() agenda.add_bill(bill) video = row.xpath('.//a[./span[@class="OnDemand"]]') if video: event.add_media_link( 'Video of Hearing', video[0].xpath('@href')[0], 'text/html' ) if 'subcommittee' in title.lower(): subcom = title.split('-')[0].strip() event.add_participant( subcom, type='committee', note='host', ) else: event.add_participant( com, type='committee', note='host', ) yield event
def scrape_upper(self):
    """Scrape MO Senate hearings from the hearings-schedule page (yields Events)."""
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'
    html = self.get(listing_url).text

    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')
        location = self.row_content(page, 'Room:')

        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

        # com = self.row_content(page, 'Committee:')
        com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        # Drop the trailing chair name, e.g. "Ways and Means, Senator X".
        com = com.split(', Senator')[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )

        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )

        event.add_source(listing_url)

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        # Each agenda entry is its own small table; row layout differs
        # depending on whether a bill link is present.
        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_link = ''
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

                bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

        yield event
def scrape_event_page(self, session, chamber, url, datetime):
    """Parse a TX committee-hearing notice page into an Event (yielded).

    `datetime` is the naive meeting datetime already parsed by the
    caller (note: it shadows the stdlib module name in this scope).
    """
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    # Collect "KEY: value" pairs from the notice paragraphs and keep the
    # whole notice text around for bill-number extraction below.
    for p in info:
        content = re.sub("\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()
    committee = metainfo['COMMITTEE']
    where = metainfo['PLACE']
    # The chair is sometimes jammed into the PLACE line.
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo['PLACE'] = where.strip()
        metainfo['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo['CHAIR']

    plaintext = re.sub("\s+", " ", plaintext).strip()
    # Bill ids like "HB 123", "SJR 4" (chamber letter, type, number).
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where
    )

    event.add_source(url)
    event.add_participant(committee, type='committee', note='host')
    if chair is not None:
        event.add_participant(chair, type='legislator', note='chair')

    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        item = event.add_agenda_item('Bill up for discussion')
        item.add_bill(bill_id)

    # Keep the full notice text as a catch-all agenda entry.
    event.add_agenda_item(plaintext)

    yield event
def scrape_event_page(self, url, event_type):
    """Scrape one MA legislature hearing/event detail page (yields an Event).

    `event_type` is the listing category; only 'Hearing' pages get the
    committee added as a host participant.
    """
    page = self.lxmlize(url)
    page.make_links_absolute('https://malegislature.gov/')

    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace('Hearing Details', '').strip()
    title = title.replace('Special Event Details', '')

    # The eventInformation <dl> lays out day/time/location/description
    # in fixed <dd> positions.
    start_day = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
    start_time = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()
    location = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[4]//a)').strip()
    description = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

    start_date = self._TZ.localize(
        dateutil.parser.parse(
            '{} {}'.format(start_day, start_time),
        )
    )

    event = Event(
        start_date=start_date,
        name=title,
        location_name=location,
        description=description
    )

    event.add_source(url)

    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')

    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()
        if agenda_title == '':
            agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()

        agenda = event.add_agenda_item(description=agenda_title)

        bills = row.xpath('.//tbody/tr/td[1]/a/text()')
        for bill in bills:
            # "H.123" -> "H 123"
            bill = bill.strip().replace('.', ' ')
            agenda.add_bill(bill)

    if event_type == 'Hearing':
        event.add_participant(
            title,
            type='committee',
            note='host',
        )

    yield event
def scrape(self):
    """Scrape DC Council calendar events (yields Events)."""
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)

    # Matches the hosting committee inside the free-text description.
    committee_regex = re.compile("(Committee .*?)will")

    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(), place_and_time[1].strip()])
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
        description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        name = description_lines[0].text_content()
        desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
        description = re.sub(r'\s+', " ", description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")

        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'

        e = Event(name=name,
                  description=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  classification=event_type,
                  )

        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
            # Normalize e.g. "B 21-0123" -> "B21-0123" with zero-padded number.
            ses, num = bill.split("-")
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            item = e.add_agenda_item(bill_desc)
            item.add_bill(bill)

        e.add_source(calendar_url)

        if committee:
            e.add_participant(committee, type='organization', note='host')

        yield e
def scrape(self, chamber=None):
    """Scrape UT committee agendas from the Granicus RSS feed (yields Events)."""
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        # RSS titles look like "<committee> - <Mon DD, YYYY>".
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue

        event = Event(name=title,
                      start_date=self._tz.localize(datetime.datetime.strptime(when, '%b %d, %Y')),
                      location_name='State Capitol'
                      )
        event.add_source(URL)

        # Follow the per-event agenda page linked from the item body.
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type='committee', note='host')

        documents = doc.xpath('.//td')
        for document in documents:
            # Document links are stashed in the cell's onclick handler.
            url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                note=document.xpath('text()')[0],
                url=url,
                media_type='application/pdf'
            )
            bills = document.xpath('@onclick')
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item('Bill up for discussion')
                    item.add_bill(bill_name)
        yield event
def scrape(self):
    """Scrape the committee agenda calendar (yields Events).

    NOTE(review): `calurl` is assumed to be a module-level constant not
    visible in this chunk — confirm it is defined at import time.
    """
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

        # Exactly one committee link is expected per row.
        if len(comit_url) != 1:
            raise Exception

        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        # The listing omits the year; assume the current year.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )

        event.add_source(calurl)

        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            # Entries that are bill ids (AB/SB ...) get linked as bills.
            if entry.startswith('AB') or entry.startswith('SB'):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event
def scrape(self):
    """Scrape HI committee hearings from the capitol calendar grid (yields Events)."""
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()

        # Exactly one description span is expected per row.
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0].replace('.', '').strip()

        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = pytz.utc.localize(when)
        event = Event(name=descr, start_time=when,
                      classification='committee-meeting',
                      description=descr, location_name=where,
                      timezone=tz.zone)

        # Joint hearings list multiple committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                # Look up the short code by the committee string itself.
                # The previous code passed the literal string "committee",
                # so every lookup silently fell through to the default.
                committee = self.short_ids.get(committee,
                                               {"chamber": "unknown",
                                                "name": committee})
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type='text/html')
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(
                bill['bill_id'],
                note=bill['type']
            )
        yield event
def scrape_lower_item(self, page):
    """Parse one MO House hearing listing fragment into an Event (yielded)."""
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, 'Committee:')
    when_date = self.table_row_content(page, 'Date:')
    when_time = self.table_row_content(page, 'Time:')
    location = self.table_row_content(page, 'Location:')

    if 'house hearing room' in location.lower():
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(' :', ':')
    # some times have extra info after the AM/PM
    if 'upon' in when_time:
        when_time = when_time.split('AM', 1)[0]
        when_time = when_time.split('PM', 1)[0]

    start_date = self._TZ.localize(
        dateutil.parser.parse('{} {}'.format(when_date, when_time))
    )

    event = Event(
        start_date=start_date,
        name=com,
        location_name=location
    )

    event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

    event.add_participant(
        com,
        type='committee',
        note='host',
    )

    # different from general MO link xpath due to the <b>
    house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
                       'or contains(@href, "bill.aspx")]/b/text()'

    for bill_title in page.xpath(house_link_xpath):
        # Title looks like "HCS HB 123 -- Some description".
        bill_no = bill_title.split('--')[0].strip()
        bill_no = bill_no.replace('HCS', '').strip()

        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
def scrape_page(self, url, session, chamber):
    """Scrape one IL hearing-notice page; returns a populated Event."""
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    # First table holds the "Key: value" metadata rows.
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

    event = Event(description,
                  start_date=datetime,
                  location_name=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # The previous code discarded the result of str.replace(), leaving
        # the prefix in place; assign it back (and strip the leftover
        # whitespace) so the participant is the bare committee name.
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant(ctty_name, 'organization')

    # Second table lists the bills on the agenda.
    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def parse_event(self, row, chamber):
    """Parse one AK committee-schedule XML row into an Event (yielded)."""
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath('string(Sponsor)').strip()
    committee_name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        self.COMMITTEES[chamber][committee_code]['name']
    )

    name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        row.xpath('string(Title)').strip()
    )

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    # NOTE(review): `name` always contains the chamber prefix plus a
    # space, so this equality check can never be true — confirm whether
    # it should test the raw Title instead.
    if name == '':
        name = committee_name

    location = row.xpath('string(Location)').strip()

    # events with no location all seem to be committee hearings
    if location == '':
        location = 'Alaska State Capitol, 120 4th St, Juneau, AK 99801'

    start_date = dateutil.parser.parse(row.xpath('string(Schedule)'))
    # todo: do i need to self._TZ.localize() ?

    event = Event(
        start_date=start_date,
        name=name,
        location_name=location
    )

    event.add_source('http://w3.akleg.gov/index.php#tab4')

    event.add_participant(
        committee_name,
        type='committee',
        note='host',
    )

    for item in row.xpath('Agenda/Item'):
        agenda_desc = item.xpath('string(Text)').strip()
        if agenda_desc != '':
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath('BillRoot'):
                bill_id = item.xpath('string(BillRoot)')
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r'\s+', ' ', bill_id)
                agenda_item.add_bill(bill_id)

    yield event
def scrape_chamber(self, chamber):
    """Scrape PA committee meetings for one chamber (yields Events)."""
    url = utils.urls['events'][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor name in the enclosing day container.
        date_string = table.xpath('ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath('tr'):
            time_string = row.xpath('td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = row.xpath(
                'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
            )[-1].text_content().strip()
            location = row.xpath(
                'td[@class="CMS-MeetingDetail-Location"]'
            )[0].text_content().strip()
            committees = row.xpath('.//div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
            bills = row.xpath('.//a[contains(@href, "billinfo")]')

            try:
                start_time = datetime.datetime.strptime(
                    '{} {}'.format(date_string, time_string),
                    '%m/%d/%Y %I:%M %p',
                )
            except ValueError:
                break

            event = Event(
                name=description,
                start_time=self._tz.localize(start_time),
                location_name=location,
                timezone=self._tz.zone,
            )
            event.add_source(url)
            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs returns a list per key; index [0] so the bill
                    # id is e.g. "HB 123" rather than "['H']['B'] ['123']".
                    item.add_bill('{}{} {}'.format(
                        qs['body'][0], qs['type'][0], qs['bn'][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # Same list-unwrap for the committee code (may be absent).
                    com_code = qs.get('Code')
                    item.add_committee(
                        re.sub(r' \([S|H]\)$', '', committee.text),
                        id=com_code[0] if com_code else None,
                    )
            yield event
def scrape(self, session=None):
    """Scrape NJ committee agendas out of the legislature's Access DB (yields Events)."""
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    # Sessions are numbered from 209 = 2000-01; convert to the start year.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")
    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        # Bill ids like "A-1234" / "S1234" mentioned in the comments.
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record['CommHouse']]
        except KeyError:
            self.warning('unknown committee code %s, skipping',
                         record['CommHouse'])
            # Previously fell through with hr_name undefined, raising
            # NameError on the next line; actually skip as the log says.
            continue

        description = 'Meeting of the {}'.format(hr_name)

        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record['Location'] or 'Statehouse',
        )
        # All related bills share a single agenda item, created lazily.
        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill['bill_id'])

        event.add_committee(
            hr_name,
            id=record['CommHouse'],
            note='host',
        )
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')
        yield event
def scrape_chamber(self, chamber):
    """Scrape CA hearings for one chamber from the mirrored DB (yields Events).

    Hearings are grouped by (location, date) so that all bills heard in
    one committee meeting land on a single Event.
    """
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description

        date = self._tz.localize(hearing.hearing_date)

        # Location strings start with "Asm" or "Sen" — use that to pick
        # the chamber and skip rows for the other one.
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for ((location, date), hearings) in grouped_hearings.items():

        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bill_ids]

        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(hearing.committee_nr for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = 'Committee Meeting: ' + committee_name
        event = Event(
            name=desc,
            start_date=date,
            location_name=committee_name,
        )
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            item = event.add_agenda_item('consideration')
            item.add_bill(bill_id, note=type_)

        event.add_person(committee_name + ' Committee', note='host')
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

        yield event
def scrape_chamber(self, chamber, session):
    """Scrape WA committee agendas for one chamber over the next 30 days
    (yields Events).
    """
    # Chamber id used by the agendas site query string.
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now+timedelta(days=30)).strftime(print_format)
    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    committees = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath("//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading, content = meeting.xpath("./ul/li")
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            # NOTE(review): lines 6-9 of the content block are assumed to
            # hold the room/address — confirm against the live markup.
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(), "%m/%d/%Y %I:%M:%S %p")

            location = (where or '').strip() or "unknown"

            event = Event(name=meeting_title, start_time=self._tz.localize(when),
                          timezone=self._tz.zone,
                          location_name=location,
                          description=meeting_title)

            event.add_participant(who.strip(), type='committee', note='host')
            event.add_source(url)

            # only scraping public hearing bills for now.
            bills = meeting.xpath(".//div[text() = 'Public Hearing']/following-sibling::li"
                                  "[contains(@class, 'visible-lg')]")
            for bill in bills:
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                item = event.add_agenda_item(descr.strip())
                item.add_bill(bill_id.strip())

            yield event
def scrape(self):
    """Scrape committee events in a rolling window (10 days back, 30 ahead).

    Yields Events built from the calendar list plus each event's popover
    detail page.
    """
    start = dt.datetime.utcnow()
    start = start - dt.timedelta(days=10)  # include recently-past events
    end = start + dt.timedelta(days=30)

    url = URL.format(**{"from": start.strftime("%Y/%m/%d"),
                        "til": end.strftime("%Y/%m/%d")})

    page = self.lxmlize(url)
    events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

    for event in events:
        string = event.text_content()

        po = CLICK_INFO.match(event.xpath(".//span")[0].attrib["onclick"])
        if po is None:
            continue

        poid = po.groupdict()["info_id"]  # This is used to get more deetz on

        popage = self.popOverUrl(poid)

        when = dt.datetime.strptime(popage.xpath("//strong")[0].text,
                                    "%B %d, %Y @ %I:%M %p")
        who = popage.xpath("//h1")[0].text

        related = []

        for item in popage.xpath("//div"):
            t = item.text
            if t is None:
                continue
            t = t.strip()
            for related_entity in ORD_INFO.findall(t):
                related.append({
                    "ord_no": related_entity,
                    "what": t,
                })

        # Use the pupa Event keyword names — the previous `when=` /
        # `location=` are not Event kwargs and raised TypeError.
        e = Event(name=who,
                  start_date=when,
                  location_name="unknown")
        e.add_source(url)

        for o in related:
            i = e.add_agenda_item(o["what"])
            i.add_bill(o["ord_no"], note="consideration")

        yield e
def scrape(self):
    """Migrate events from the legacy OpenStates API into pupa Events.

    Pops every consumed key off each API record so the trailing assert
    can prove no field was silently dropped. Yields Events.
    """
    method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
    self.events = self.api(method)

    seen = set()

    for event in self.events:
        begin = self._date_parse(event.pop('when'))
        end = self._date_parse(event.pop('end'))
        all_day = event.pop('all_day', False)

        e = Event(name=event.pop('description'),
                  classification=event.pop('type'),
                  location_name=event.pop('location'),
                  timezone=event.pop('timezone'),
                  start_time=begin,
                  end_time=end,
                  all_day=all_day,)

        # Truncate over-long fields to fit the schema's length limits.
        if len(e.name) >= 300:
            e.name = e.name[:290]

        if len(e.location['name']) >= 100:
            e.location['name'] = e.location['name'][:90]

        # De-dupe on (name, description, start) across the whole feed.
        composite_key = (e.name, e.description, e.start_time)
        if composite_key in seen:
            print("Duplicate found: %s/%s/%s" % (composite_key))
            continue

        seen.add(composite_key)

        for source in event.pop('sources'):
            if 'retrieved' in source:
                source.pop('retrieved')
            e.add_source(**source)

        if e.sources == []:
            continue

        ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                  'notes', '+location_url', 'session', 'id', '+chamber',
                  '+agenda', '+cancelled', '+media_contact', '+contact',
                  '+details']
        # +agenda:
        #   Agenda on old (very old) OpenStates data is actually a string
        #   and not any sort of structured data we can use in the items
        #   schema, and is only present for a handful of events.

        for i in ignore:
            if i in event:
                event.pop(i)

        for link in ['+link', 'link']:
            if link in event:
                e.add_source(url=event.pop(link))

        for p in event.pop('participants', []):
            type_ = {
                "committee": "organization",
                "legislator": "person",
                None: None,
            }[p.get('participant_type')]

            if type_ is None:
                # Garbage data.
                continue

            e.add_participant(name=p['participant'],
                              note=p['type'],
                              type=type_,)

        for b in event.pop('related_bills', []):
            item = e.add_agenda_item(
                b.pop('description', b.pop('+description', None)))
            item.add_bill(bill=b['bill_id'],
                          note=b.pop('type', b.pop('+type', None)))

        seen_documents = set([])
        for document in event.pop('documents', []):
            if document['url'] in seen_documents:
                print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                    document['url'], document['name']
                ))
                continue

            seen_documents.add(document['url'])

            e.add_document(url=document['url'],
                          note=document['name'])

        # Every key should have been consumed above.
        assert event == {}, "Unknown fields: %s" % (
            ", ".join(event.keys())
        )

        yield e
def scrape_events_range(self, start_date, end_date):
    """Scrape Toronto council/committee meetings between two dates.

    Yields an Event per meeting; additionally yields a Bill for each
    ACTION agenda item that is being introduced at that meeting.
    """
    def daterange(start_date, end_date):
        # Yield each day in [start_date, end_date).
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour, minute=time.minute,
                                             second=0, microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month,
                                                      start.day)
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                return event['publishing_status'] in [
                    'Agenda Published', 'Minutes Published'
                ]

            def is_council(event):
                # Full-council meetings use a different agenda template.
                return True if event[
                    'meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(
                    event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'],
                                          is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):

                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def is_vote_event(item):
                        return True if item['type'] == 'ACTION' else False

                    def normalize_wards(raw):
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    def is_being_introduced(item, event):
                        # An item is "introduced" here when it originates
                        # from the committee holding this meeting.
                        org_name = event['meeting']
                        identifier = item['identifier']

                        # `org_code` is two-letter code for committee
                        current_org_code = self.committees_by_name.get(
                            org_name)[0]['code']
                        originating_org_code = re.search(
                            r'([A-Z]{2})[0-9]+\.[0-9]+', identifier).group(1)

                        return current_org_code == originating_org_code

                    if is_vote_event(item):
                        wards = normalize_wards(item['wards'])
                        identifier_regex = re.compile(
                            r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                        # Match the short identifier to exactly one full id.
                        [full_identifier] = [
                            id for id in full_identifiers
                            if identifier_regex.match(id).group(1) ==
                            item['identifier']
                        ]
                        a.add_bill(full_identifier)
                        if is_being_introduced(item, event):
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session='2014-2018',
                                identifier=full_identifier,
                                title=item['title'],
                                from_organization={'name': org_name},
                            )

                            b.add_source(agenda_url)
                            b.add_document_link(
                                note='canonical',
                                media_type='text/html',
                                url=AGENDA_ITEM_TEMPLATE.format(
                                    full_identifier))
                            b.extras = {
                                'wards': wards,
                            }

                            yield b

            yield e
def scrape(self):
    """Merge Legistar API events with rows scraped from the web calendar.

    Each API event is matched to its web-calendar row by
    (body name, date, time); the web row supplies audio/minutes links
    and the preferred source URL. Yields Events.
    """
    web_results = self.scrapeWebCalendar()

    for event in self.events():
        # Create a key for lookups in the web_results dict.
        key = (event['EventBodyName'].strip(),
               self.toTime(event['EventDate']).date(),
               event['EventTime'])

        # The \xa0 placeholders mirror the site's "Not available" cells.
        web_event_dict = web_results.get(key, {
            'Meeting Details': 'Meeting\xa0details',
            'Audio': 'Not\xa0available',
            'Recap/Minutes': 'Not\xa0available'
        })

        body_name = event["EventBodyName"]
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-')]
        else:
            event_name = body_name

        # Map Legistar agenda status to the pupa event status vocabulary.
        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = ''

        e = Event(event_name,
                  start_time=event["start"],
                  timezone=self.TIMEZONE,
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        e.add_participant(name=body_name,
                          type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
        if web_event_dict['Audio'] != 'Not\xa0available':
            # The audio link is a redirect; store its final destination.
            redirect_url = self.head(
                web_event_dict['Audio']['url']).headers['Location']

            e.add_media_link(note=web_event_dict['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event_dict['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event_dict['Recap/Minutes']['label'],
                           url=web_event_dict['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event_dict['Meeting Details'] != 'Meeting\xa0details':
            # Prefer the per-meeting detail page when it resolves.
            if requests.head(web_event_dict['Meeting Details']
                             ['url']).status_code == 200:
                e.add_source(web_event_dict['Meeting Details']['url'],
                             note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape_meeting(self, url):
    """Scrape one LA committee-meeting notice page (yields an Event).

    Meetings whose title names no chamber keyword are skipped entirely.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    title = page.xpath("//a[@id='linkTitle']//text()")[0]
    date = page.xpath("//span[@id='lDate']/text()")[0]
    time = page.xpath("//span[@id='lTime']/text()")[0]
    location = page.xpath("//span[@id='lLocation']/text()")[0]

    # Normalize AM/PM spellings so strptime's %p can parse them.
    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m.", "Noon"],
    }

    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)

    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r'(?i)\d[AP]M$', time):
        time = time[:-2] + " " + time[-2:]

    # Times like "UPON ADJOURNMENT" / "TBA" become all-day events.
    if re.search("UPON ADJ|TBA", ' '.join(time.split()).upper()):
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime("%s %s" % (
            date,
            time
        ), "%B %d, %Y %I:%M %p")

    # when = self._tz.localize(when)

    description = "Meeting on %s of the %s" % (date, title)
    chambers = {"house": "lower",
                "senate": "upper",
                "joint": "legislature"}

    # Skip pages whose title mentions no chamber at all.
    for chamber_ in chambers.keys():
        if chamber_ in title.lower():
            break
    else:
        return

    event = Event(name=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  all_day=all_day)
    event.add_source(url)
    event.add_participant(title, note='host', type='committee')

    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)

    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            continue

        bill_title = bill.text_content()
        if "S" in bill_title or "H" in bill_title:
            item = event.add_agenda_item(descr.text_content())
            item.add_bill(bill_title)
        else:
            continue

    yield event
def scrape_meeting(self, url):
    """Scrape a single committee-meeting page and yield an Event.

    url -- absolute URL of the meeting detail page.

    Yields nothing when the meeting title does not mention one of
    "house", "senate", or "joint". Meetings whose time field reads
    "UPON ADJ" or "TBA" become all-day events pinned to the date only.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    title = page.xpath("//a[@id='linkTitle']//text()")[0]
    date = page.xpath("//span[@id='lDate']/text()")[0]
    time = page.xpath("//span[@id='lTime']/text()")[0]
    location = page.xpath("//span[@id='lLocation']/text()")[0]
    # Map the site's various AM/PM spellings to strptime-friendly tokens;
    # "Noon" is treated as PM.
    substs = {"AM": ["A.M.", "a.m."], "PM": ["P.M.", "p.m.", "Noon"]}
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)
    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r"(?i)\d[AP]M$", time):
        time = time[:-2] + " " + time[-2:]
    # "UPON ADJ"/"TBA" means no fixed start time: record an all-day event.
    if re.search("UPON ADJ|TBA", " ".join(time.split()).upper()):
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime(
            "%s %s" % (date, time), "%B %d, %Y %I:%M %p"
        )
    # when = self._tz.localize(when)
    description = "Meeting on %s of the %s" % (date, title)
    chambers = {"house": "lower", "senate": "upper", "joint": "legislature"}
    # for/else: fall through to `return` when no chamber keyword appears
    # anywhere in the title.
    for chamber_ in chambers.keys():
        if chamber_ in title.lower():
            break
    else:
        return
    event = Event(
        name=description,
        start_date=self._tz.localize(when),
        location_name=location,
        all_day=all_day,
    )
    event.add_source(url)
    event.add_participant(title, note="host", type="committee")
    # The first <tr valign='top'> is a header row; skip it.
    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)
    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            # Not a five-cell agenda row; ignore.
            continue
        bill_title = bill.text_content()
        # Heuristic: cells containing S or H look like bill identifiers.
        if "S" in bill_title or "H" in bill_title:
            item = event.add_agenda_item(descr.text_content())
            item.add_bill(bill_title)
        else:
            continue
    yield event
def scrape_lower(self):
    """Scrape Ohio House committee meetings from the weekly PDF calendar.

    Downloads the calendar PDF, converts it to text, and yields one Event
    per meeting with the hosting committee attached; any House bill ids
    (e.g. "H. B. 123") found in the description become agenda items.
    """
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    # Split on day headers like "Tuesday, June 5, 2018". With a capturing
    # group, re.split yields [preamble, header, body, header, body, ...].
    # BUG FIX: the previous pattern r'(\wF+day, ...)' cannot match any real
    # weekday name ("Friday" is F-r-i-d-a-y, not \wF+day), so the split
    # never produced day chunks and nothing was ever scraped. Use \w+day,
    # matching the sibling scrape_upper().
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for i, day in enumerate(days[1:]):
        if i % 2 == 0:
            # Even offsets are the captured day headers.
            date = day
        else:
            # Odd offsets are that day's body, which itself alternates
            # committee names with meeting blocks.
            events = re.split(r'\n((?:\w+\s?)+)\n', day)
            comm = ''
            for j, event in enumerate(events[1:]):
                if j % 2 == 0:
                    comm = event.strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.) # Meeting time
                            .*?,\s # Potential extra text for meeting time
                            (.*?),\s # Location, usually a room
                            .*?\n # Chairman of committee holding event
                            (.*) # Description of event
                            ''',
                            event).groups()
                    except AttributeError:
                        # Block doesn't look like a meeting entry; skip.
                        continue
                    # "1:30 p.m." -> "1:30 PM" so strptime's %p matches.
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Drop blank lines and digit-led lines (page/line
                    # numbers from the PDF conversion).
                    description = '\n'.join([
                        x.strip()
                        for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(name=description,
                                  start_date=time,
                                  location_name=location,
                                  description=description)
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    # Attach House bills mentioned in the description.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape(self, window=3):
    """Yield Events for Legistar meetings starting within the last `window` days.

    window -- lookback horizon in days (may be fractional).

    For each paired (API record, web record) from self.events(): build the
    Event from API fields, then attach recording link, Agenda/Minutes
    documents, participating committees, agenda items (with bill ids),
    roll-call attendees, and api/web sources.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))
    for api_event, event in self.events(n_days_ago):
        when = api_event['start']
        location = api_event['EventLocation']
        description = event['Meeting\xa0Topic']
        # Placeholder topics carry no information; treat as no description.
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = None
        if description:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=api_event['status'])
        else:
            # Build without the description kwarg when there is none.
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      location_name=location,
                      status=api_event['status'])
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(api_event['EventId'])
        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')
        location_string = event[u'Meeting Location']
        location_notes, other_orgs = self._parse_location(location_string)
        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}
        # Determine which committees/bodies take part in this event.
        if e.name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in e.name.lower():
            participating_orgs = [e.name]
        else:
            participating_orgs = []
        if other_orgs:
            # e.g. "Jointly with the X and the Y" -> ["X", "Y"]
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)
        for org in participating_orgs:
            e.add_committee(name=org)
        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
        # Attendees: anyone marked "Present" in the roll call (set to
        # collapse duplicate roll-call rows).
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'].strip())
        for person in participants:
            e.add_participant(name=person, type="person")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')
        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is presumably not a dict when there is no
            # detail page (subscripting raises TypeError); fall back to the
            # events listing page as the web source.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')
        yield e
def scrape_events_range(self, start_date, end_date):
    """Yield Events (and newly-seen Bills) for each day in [start_date, end_date).

    start_date, end_date -- datetime bounds; end_date is exclusive.

    For each calendar day, pulls that day's meetings, builds an Event in
    the America/Toronto timezone, and — when an agenda is published —
    attaches agenda items and yields a Bill for each agenda identifier not
    seen before (tracked in self.seen_agenda_items).
    """
    def daterange(start_date, end_date):
        # Generate each day from start_date up to but excluding end_date.
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            # event['time'] is a wall-clock string like "9:30 AM"; combine
            # it with the day being scraped.
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour,
                                             minute=time.minute,
                                             second=0,
                                             microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year,
                                                      start.month,
                                                      start.day)
            org_name = event['meeting']
            e = Event(
                name = org_name,
                start_time = start,
                timezone = tz.zone,
                location_name = event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name = org_name,
                type = 'organization',
            )

            def is_agenda_available(event):
                # Minutes imply the agenda was published earlier.
                return event['publishing_status'] in ['Agenda Published',
                                                      'Minutes Published']

            def is_council(event):
                # Full-council meetings use a different agenda URL template.
                return True if event['meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'],
                                                              is_council(event)))
                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        # Empty or 'All' -> 'all'; otherwise a list of
                        # ward names split on commas.
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    wards = normalize_wards(item['wards'])
                    # Agenda identifiers look like "2017.EX1.2"; match the
                    # short form (group 1) against the item identifier.
                    # NOTE: list-unpacking asserts exactly one match.
                    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [id for id in full_identifiers
                                         if identifier_regex.match(id).group(1) == item['identifier']]
                    a.add_bill(full_identifier)
                    # First sighting of this agenda item: emit it as a Bill.
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session = '2014-2018',
                            identifier = full_identifier,
                            title = item['title'],
                            from_organization = {'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(note='canonical',
                                            media_type='text/html',
                                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                        b.extras = {
                            'wards': wards,
                        }
                        self.seen_agenda_items.append(full_identifier)
                        yield b
            yield e
def scrape(self, window=30):
    """Yield Events for Pittsburgh Legistar meetings within the last `window` days.

    window -- lookback horizon in days (may be fractional).

    The second element of the API status tuple carries free-text status
    notes; a cascade of phrase checks maps those to a cancelled status, a
    location amendment, or a skip, falling back to the raw API status.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    self.retry_wait_seconds = 20
    for api_event, event in self.events(n_days_ago):
        description = api_event["EventComment"]
        when = api_event["start"]
        location = api_event["EventLocation"]
        # Expand the bare chamber name to a full street address.
        if location == "Council Chambers":
            location = "Council Chambers, 5th Floor, City-County Building, " \
                       "414 Grant Street, Pittsburgh, PA 15219"
        if not location :
            continue
        status_string = api_event["status"]
        if len(status_string) > 1 and status_string[1] :
            status_text = status_string[1].lower()
            # Phrases indicating the meeting as listed will not happen.
            if any(phrase in status_text for phrase in ("rescheduled to", "postponed to", "reconvened to", "rescheduled to", "meeting recessed", "recessed meeting", "postponed to", "recessed until", "deferred", "time change", "date change", "recessed meeting - reconvene", "cancelled", "new date and time", "rescheduled indefinitely", "rescheduled for",)) :
                status = "cancelled"
            elif status_text in ("rescheduled", "recessed") :
                status = "cancelled"
            # Notes that don't change the status; keep the API's value.
            elif status_text in ("meeting reconvened", "reconvened meeting", "recessed meeting", "reconvene meeting", "rescheduled hearing", "rescheduled meeting",) :
                status = api_event["status"]
            elif status_text in ("amended notice of meeting", "room change", "amended notice", "change of location", "revised - meeting date and time") :
                status = api_event["status"]
            # A room note amends the location rather than the status.
            elif "room" in status_text :
                location = status_string[1] + ", " + location
            elif status_text in ("wrong meeting date",):
                continue
            else :
                # Unrecognized note: surface it for scraper maintenance.
                print(status_text)
                status = api_event["status"]
        else :
            status = api_event["status"]
        # Normalize the two special meeting names to friendlier labels.
        if event["Meeting Name"] == "Post Agenda":
            event_name = "Agenda Announcement"
        elif event["Meeting Name"] == "City Council":
            event_name = "Regular meeting"
        else:
            event_name = event["Meeting Name"]
        if description:
            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            # Omit the description kwarg entirely when there is none.
            e = Event(name=event_name,
                      start_date=when,
                      location_name=location,
                      status=status)
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(api_event["EventId"])
        if event["Meeting video"] != "Not\xa0available":
            if "url" not in event["Meeting video"]:
                pass
            else:
                video_url = self.get_meeting_video_link(event["Meeting video"]["url"])
                e.add_media_link(note="Recording",
                                 url=video_url,
                                 type="recording",
                                 media_type="text/html")
        self.addDocs(e, event, "Published agenda")
        self.addDocs(e, event, "Published minutes")
        participant = event["Meeting Name"]
        if participant == "City Council" or participant == "Post Agenda":
            participant = "Pittsburgh City Council"
        e.add_participant(name=participant, type="organization")
        for item in self.agenda(api_event):
            clean_title = self.clean_agenda_item_title(item["EventItemTitle"])
            agenda_item = e.add_agenda_item(clean_title)
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
            # Per-item video: same player URL with item-specific meta_id.
            if item["EventItemVideo"] and event["Meeting video"] != "Not\xa0available":
                item_video_url = self.get_meeting_video_link(event["Meeting video"]["url"]) + \
                    '?view_id=2&meta_id=' + str(item["EventItemVideo"])
                agenda_item.add_media_link(note="Recording",
                                           url=item_video_url,
                                           type="recording",
                                           media_type="text/html")
        # Attendees: anyone marked "Present" in the roll call.
        participants = set()
        for call in self.rollcalls(api_event):
            if call["RollCallValueName"] == "Present":
                participants.add(call["RollCallPersonName"])
        for person in participants:
            e.add_participant(name=person, type="person")
        e.add_source(self.BASE_URL + "/events/{EventId}".format(**api_event),
                     note="api")
        try:
            detail_url = event["Meeting Details"]["url"]
        except TypeError:
            # 'Meeting Details' presumably isn't a dict when there is no
            # detail page; fall back to the events listing page.
            e.add_source(self.EVENTSPAGE, note="web")
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note="web")
        yield e
def scrape_agenda(self, url):
    """Scrape one agenda-notice page and yield an Event with its bills.

    url -- the agenda page URL.

    Returns (yields nothing) when the page has no time/place table or the
    meeting is marked CANCELLED. Note: the local variable `datetime` is a
    string until strptime succeeds; it shadows nothing used here since the
    module is imported as `dt`.
    """
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return
    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    # Build DATE:/TIME:/PLACE: -> value from the table's label/value rows.
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf["DATE:"]
    time = metainf["TIME:"]
    where = metainf["PLACE:"]
    # check for duration in time
    if " - " in time:
        start, end = time.split(" - ")
        am_pm_srch = re.search("(?i)(am|pm)", end)
        if am_pm_srch:
            # Start lacks its own AM/PM; borrow it from the end time.
            time = " ".join([start, am_pm_srch.group().upper()])
        else:
            time = start
    fmts = [
        "%A, %B %d, %Y",
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M"
    ]
    event_desc = "Meeting Notice"
    if "Rise" in time:
        # "Rise of the Senate"-style times have no clock value; keep the
        # date only and put the phrasing into the event name.
        datetime = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        datetime = "%s %s" % (date, time)
    if "CANCELLED" in datetime.upper():
        return
    # Strip punctuation variants and scheduling noise before parsing.
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        datetime = datetime.replace(trans, transtable[trans])
    datetime = datetime.strip()
    # Try each accepted format; first success wins.
    for fmt in fmts:
        try:
            datetime = dt.datetime.strptime(datetime, fmt)
            break
        except ValueError:
            continue
    event = Event(name=event_desc,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.add_source(url)
    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib["href"]
        event.add_document(bill.text_content(), bill_ft,
                           media_type="application/pdf")
        root = bill.xpath("../../*")
        root = [x.text_content() for x in root]
        bill_id = "".join(root)
        if "SCHEDULED FOR" in bill_id:
            continue
        # Description lives two siblings past the bill's great-grandparent.
        descr = (bill.getparent().getparent().getparent().getnext().
                 getnext().text_content())
        # `replace` is a module-level substitution table defined elsewhere
        # in this file.
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        item = event.add_agenda_item(descr)
        item.add_bill(bill.text_content())
    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    event.add_participant(committee, "committee", note="host")
    yield event
def scrape(self):
    """Migrate all legacy OpenStates events for self.state into pupa Events.

    Consumes each API event dict destructively via pop() so that the
    trailing `assert event == {}` catches any field this migration does
    not know about. Duplicate (name, description, start_time) triples and
    events with no usable sources are skipped.
    """
    method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
    self.events = self.api(method)
    seen = set()
    for event in self.events:
        e = Event(name=event.pop('description'),
                  classification=event.pop('type'),
                  location=event.pop('location'),
                  timezone=event.pop('timezone'),
                  start_time=self._date_parse(event.pop('when')),
                  end_time=self._date_parse(event.pop('end')),)
        # Truncate over-long fields to fit schema limits.
        if len(e.name) >= 300:
            e.name = e.name[:290]
        if len(e.location['name']) >= 100:
            e.location['name'] = e.location['name'][:90]
        composite_key = (e.name, e.description, e.start_time)
        if composite_key in seen:
            print("Duplicate found: %s/%s/%s" % (composite_key))
            continue
        seen.add(composite_key)
        for source in event.pop('sources'):
            # 'retrieved' is legacy metadata add_source won't accept.
            if 'retrieved' in source:
                source.pop('retrieved')
            e.add_source(**source)
        if e.sources == []:
            continue
        # Legacy fields with no pupa equivalent; discard them.
        ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                  'notes', '+location_url', 'session', 'id', '+chamber',
                  '+agenda', '+cancelled', '+media_contact', '+contact',
                  '+details']
        # +agenda:
        #   Agenda on old (very old) OpenStates data is actually a string
        #   and not any sort of structured data we can use in the items
        #   schema, and is only present for a handful of events.
        for i in ignore:
            if i in event:
                event.pop(i)
        for link in ['+link', 'link']:
            if link in event:
                e.add_source(url=event.pop(link))
        for p in event.pop('participants', []):
            # Map legacy participant types onto pupa types; unknown types
            # fall through as None.
            type_ = {
                "committee": "organization",
                "legislator": "person",
                None: None,
            }[p.get('participant_type')]
            if type_ is None:
                # Garbage data.
                continue
            e.add_participant(name=p['participant'],
                              note=p['type'],
                              type=type_,)
        for b in event.pop('related_bills', []):
            # Prefer 'description'/'type'; fall back to the '+'-prefixed
            # legacy variants.
            item = e.add_agenda_item(
                b.pop('description', b.pop('+description', None)))
            item.add_bill(bill=b['bill_id'],
                          note=b.pop('type', b.pop('+type', None)))
        # Drop documents with duplicate URLs (known dirty data).
        seen_documents = set([])
        for document in event.pop('documents', []):
            if document['url'] in seen_documents:
                print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                    document['url'], document['name']
                ))
                continue
            seen_documents.add(document['url'])
            e.add_document(url=document['url'],
                           note=document['name'])
        # Everything should have been consumed by now.
        assert event == {}, "Unknown fields: %s" % (
            ", ".join(event.keys())
        )
        yield e
def scrape(self, window=3):
    """Yield Events for Chicago Legistar meetings within the last `window` days.

    window -- lookback horizon in days (may be fractional).

    The location string encodes status notes after 'Chicago, Illinois';
    a cascade of phrase checks maps those notes to a cancelled status, a
    location amendment, a free-text description, or a skip, falling back
    to the raw API status.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))
    for api_event, event in self.events(n_days_ago):
        description = None
        when = api_event['start']
        location_string = event[u'Meeting Location']
        # First two '--'-separated segments form the location proper.
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue
        # Anything after 'Chicago, Illinois' is a free-text status note.
        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            # Phrases meaning the meeting as listed will not happen.
            if any(phrase in status_text for phrase in (
                    'rescheduled to', 'postponed to', 'reconvened to',
                    'rescheduled to', 'meeting recessed',
                    'recessed meeting', 'postponed to', 'recessed until',
                    'deferred', 'time change', 'date change',
                    'recessed meeting - reconvene', 'cancelled',
                    'new date and time', 'rescheduled indefinitely',
                    'rescheduled for', )):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            # Notes that don't change the status; keep the API's value.
            elif status_text in (
                    'meeting reconvened', 'reconvened meeting',
                    'recessed meeting', 'reconvene meeting',
                    'rescheduled hearing', 'rescheduled meeting', ):
                status = api_event['status']
            elif status_text in ('amended notice of meeting', 'room change',
                                 'amended notice', 'change of location',
                                 'revised - meeting date and time'):
                status = api_event['status']
            # A room note amends the location rather than the status.
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date', ):
                continue
            else:
                # Unrecognized note: surface it, and keep it as the
                # event description (minus the '--em--' markup).
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = api_event['status']
        else:
            status = api_event['status']
        if description:
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            # Omit the description kwarg entirely when there is none.
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      location_name=location,
                      status=status)
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(api_event['EventId'])
        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')
        # Normalize body names to their canonical committee names.
        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant, type="organization")
        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
        # Attendees: anyone marked "Present" in the roll call.
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'])
        for person in participants:
            e.add_participant(name=person, type="person")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')
        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' presumably isn't a dict when there is no
            # detail page; fall back to the events listing page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')
        yield e
def scrape_upper(self):
    """Scrape Ohio Senate committee meetings from the weekly PDF calendar.

    Downloads the calendar PDF, converts it to text, and yields one Event
    per meeting with the hosting committee attached; any Senate bill ids
    (e.g. "S. B. 123") found in the description become agenda items.
    """
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text').decode()
    os.remove(path)
    # Split on day headers like "Tuesday, June 5". With a capturing group,
    # re.split alternates [preamble, header, body, header, body, ...].
    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day[1] + ", " + str(datetime.datetime.now().year)
        else:
            # Day body alternates committee names with meeting blocks.
            events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M) # Meeting time
                            .*?,\s # Potential extra text for meeting time
                            (.*?)\n # Location, usually a room
                            .*?\n # Chairman of committee holding event
                            (.*) # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        # Block doesn't look like a meeting entry; skip.
                        continue
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Drop blank lines and known boilerplate from the PDF.
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(name=description,
                                  start_date=time,
                                  location_name=location,
                                  description=description)
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    # Attach Senate bills mentioned in the description.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape(self):
    """Yield Events for NYC Legistar meetings since 2011.

    The location field may carry an '--em--'-delimited note; note parts
    starting with 'Join' name co-hosting committees, the rest are stored
    as a location note. A small deque of recent (name, when) pairs guards
    against the listing repeating the same event.
    """
    last_events = deque(maxlen=10)
    for event, agenda in self.events(since=2011):
        other_orgs = ''
        extras = []
        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split(
                '--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    # e.g. "Jointly with the ..." lists co-hosting orgs.
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']
        # First two '-'-separated segments form the location proper.
        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue
        when = self.toTime(event[u'Meeting Date'])
        # The date field has no clock time; take it from the iCalendar
        # attachment's DTSTART.
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)
        time_string = event['Meeting Time']
        if time_string in ('Deferred', ):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'
        description = event['Meeting\xa0Topic']
        # Placeholder topics carry no information; drop them.
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = ''
        event_name = event['Name']
        # Skip events we've just seen (listing can repeat entries).
        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)
        e = Event(name=event_name,
                  start_time=when,
                  timezone=self.TIMEZONE,
                  description=description,
                  location_name=location,
                  status=status)
        if extras:
            e.extras = {'location note': ' '.join(extras)}
        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')
        # Determine which committees/bodies take part in this event.
        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []
        if other_orgs:
            # e.g. "Jointly with the X and the Y" -> ["X", "Y"]
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)
        for org in participating_orgs:
            e.add_committee(name=org)
        if agenda:
            e.add_source(event["Meeting Details"]['url'])
            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        # Use the recorded action as the bill note when
                        # present; otherwise a generic 'consideration'.
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE)
        yield e
def scrape(self, window=None):
    """Yield Events for LA Metro meetings, merged from API and web records.

    window -- lookback horizon in days; None scrapes everything.

    Raises ValueError when an agenda contains duplicate
    EventItemAgendaSequence values (known Legistar data defect) so the
    scrape fails until Metro cleans the data.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None
    events = self.events(since_datetime=n_days_ago)
    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]
        # "Board of Directors - X" splits into body and event names.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name
        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'
        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'
        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(event['EventId'])
        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']
        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)
                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)
                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']
            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(
                    set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'
                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))
        e.add_participant(name=body_name, type="organization")
        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')
        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")
        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")
        if web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue
            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow the for duplicate media links,
            # so we'll ignore the the second media link if it's
            # the same as the first media link.
            #
            # Because of the way that the event['audio'] is created
            # the first audio link is always English and the
            # second is always Spanish
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')
        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')
        yield e
def transform_parse(self, parsed_form, response):
    """Transform a parsed LDA Form LD-1 into pupa entities and yield them.

    Yields, in order: the registrant (Person or Organization), the
    self-employment Organization (if any), the client, the main contact,
    affiliated organizations, foreign entities, lobbyists, the
    registration Event, and finally the Disclosure itself.

    :param parsed_form: dict produced by the LD-1 form parser (registrant,
        client, lobbyists, foreign_entities, affiliated_organizations,
        registration_type, datetimes, signature, _meta keys).
    :param response: HTTP response the form came from; only ``.url`` is used.
    """
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
        'general_description': parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }

    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description': parsed_form['client']['client_general_description']
    }

    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant': parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant': parsed_form['registration_type'][
            'new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        # Individual registrant: model the person plus a synthetic
        # "self-employment" Organization that the person belongs to.
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    # Only attach the principal-place-of-business address if any part of
    # it was actually filled in on the form.
    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    # Keep the un-joined address parts around for downstream consumers.
    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address_one",
                        "value": parsed_form['registrant'][
                            'registrant_address_one'],
                    },
                    {
                        "note": "address_two",
                        "value": parsed_form['registrant'][
                            'registrant_address_two'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_country'],
                    }
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    # The main contact belongs to the registrant org, or — for individual
    # registrants — to the synthetic self-employment org.
    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": parsed_form['client']['client_address'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client']['client_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client'][
                            'client_ppb_country'],
                    }
                ],
            },
        ],
    }

    # Collect Foreign Entities, de-duplicating on name.
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        # NOTE(review): a state/country "principal place of business" entry
        # is always present above, and a second city-level one is appended
        # here when non-empty — possibly intentional, kept as-is.
        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": fe['foreign_entity_address'],
                    },
                    {
                        "note": "city",
                        "value": fe['foreign_entity_city'],
                    },
                    {
                        "note": "state",
                        "value": fe['foreign_entity_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "state",
                        "value": fe['foreign_entity_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_ppb_country'],
                    }
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                               fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

        # TODO: add a variant on memberships to represent inter-org
        # relationships (associations, ownership, etc)
        #
        # _client['memberships'].append({
        #     "id": _foreign_entity['id'],
        #     "classification": "organization",
        #     "name": _foreign_entity['name'],
        #     "extras": {
        #         "ownership_percentage":
        #             fe['foreign_entity_amount']
        #     }
        # })

    # Collect Lobbyists
    # TODO: deal with wierd non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}

    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported':
                        parsed_form['datetimes']['effective_date'],
                    'covered_official_position':
                        l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    # Lobbyists are members of the registrant org (or the self-employment
    # org for individual registrants).
    if _registrant._type == 'organization':
        for l in _lobbyists:
            _registrant.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for l in _lobbyists:
            _registrant_self_employment.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs, de-duplicating on name.
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        # BUGFIX: this assignment previously ended with a trailing comma,
        # wrapping the list in a 1-tuple (unlike the registrant/client/
        # foreign-entity structures).
        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": ao['affiliated_organization_address'],
                    },
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_state'],
                    },
                    {
                        "note": "zip",
                        "value": ao['affiliated_organization_zip'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_ppb_country'],
                    }
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)
        # BUGFIX: new orgs were never stored in the dedup dict (the
        # foreign-entity loop above does this), so affiliated orgs were
        # never yielded or attached to the event.
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    for unique_affiliated_organization in _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification='registration'
    )

    # add participants
    _event.add_participant(type=_registrant._type,
                           id=_registrant._id,
                           name=_registrant.name,
                           note="registrant")

    if _registrant._type == 'person':
        # BUGFIX: previously re-added the registrant person a second time;
        # add the self-employment organization instead (always built when
        # the registrant is a person).
        _event.add_participant(type=_registrant_self_employment._type,
                               id=_registrant_self_employment._id,
                               name=_registrant_self_employment.name,
                               note="registrant")

    _event.add_participant(type=_client._type,
                           id=_client._id,
                           name=_client.name,
                           note="client")

    for l in _lobbyists:
        _event.add_participant(type=l._type,
                               id=l._id,
                               name=l.name,
                               note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type,
                               id=fe._id,
                               name=fe.name,
                               note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type,
                               id=ao._id,
                               name=ao.name,
                               note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _registrant.add_source(
        url=_source['url'],
        note='registrant'
    )
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(
        url=_source['url'],
        note='client'
    )
    yield _client

    _main_contact.add_source(
        url=_source['url'],
        note='main_contact'
    )
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(
            url=_source['url'],
            note='affiliated_organization'
        )
        yield ao

    for fe in _foreign_entities:
        fe.add_source(
            url=_source['url'],
            note='foreign_entity'
        )
        yield fe

    for l in _lobbyists:
        l.add_source(
            url=_source['url'],
            note='lobbyist'
        )
        yield l

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
def _parse_house_floor_xml_legislative_activity(self, xml):
    """
    Parses XML string of House floor updates and yields them in loop.

    Builds one Event per ``<floor_action>`` element, attaching agenda
    items for any bills, public laws, and votes the action references,
    plus documents for referenced reports and committees mentioned in
    the action text.

    @param xml: XML of field update
    @type xml: string
    @return: complete Event object
    @rtype: Event
    """
    tree = self._xml_parser(xml)

    # Congress number is used to build pseudo-ids for bills and votes below.
    congress = tree.xpath('.//legislative_congress')[0].get('congress')

    house_committees = self._get_current_house_committee_names()
    for fa in tree.xpath('.//floor_action'):
        fa_text = fa.xpath('.//action_description')[0].xpath('string()')

        # Timestamps in the feed are naive US/Eastern; localize then store UTC.
        eastern = pytz.timezone('US/Eastern')
        dt = datetime.datetime.strptime(fa.xpath('action_time')[0].get('for-search'),
                                        '%Y%m%dT%H:%M:%S')

        # NOTE(review): positional args here ('US/Eastern', '') depend on the
        # Event constructor signature of the pupa version in use — confirm.
        event = Event('House Floor Update on {0} at {1}.'.format(dt.strftime('%Y-%m-%d'),
                                                                 dt.strftime('%H:%M:%S')),
                      eastern.localize(dt).astimezone(pytz.utc),
                      'US/Eastern',
                      '',
                      description=fa_text,
                      classification='floor_update')

        event.set_location("East Capitol Street Northeast & First St SE, Washington, DC 20004",
                           note='House Floor', url='http://www.house.gov',
                           coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

        event.add_source(self._house_floor_src_url(date_str=tree.xpath('.//legislative_day')[0].get('date')),
                         note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

        # Clerk-assigned identifiers for this floor action.
        event.extras['act-id'] = fa.get('act-id')
        event.extras['unique-id'] = fa.get('unique-id')

        # bills
        ai_b = event.add_agenda_item(description='Bills referenced by this update.')
        for bill in fa.xpath(".//a[@rel='bill']"):
            bill_name = bill.xpath('string()')
            ai_b.add_bill(bill_name,
                          id=make_pseudo_id(identifier=bill_code_to_id(bill_name),
                                            congress=congress),
                          note="Bill was referenced on the House floor.")

        # publaws
        ai_p = event.add_agenda_item(description='Public laws referenced by this update.')
        for law in fa.xpath(".//a[@rel='publaw']"):
            # The link points at a specific rendition; its detail page lives
            # two path components up.
            detail_url = '/'.join(law.get('href').split('/')[0:-2]) + '/content-detail.html'
            ai_p.add_bill(law.xpath('string()'),
                          id=make_pseudo_id(**self._public_law_detail_scraper(url=detail_url)),
                          note='Law was referenced on the House floor.')

        # votes
        ai_v = event.add_agenda_item(description='Votes referenced by this update.')
        for vote in fa.xpath(".//a[@rel='vote']"):
            vote_name = vote.xpath('string()')
            ai_v.add_vote(vote_name,
                          id=make_pseudo_id(identifier=vote_code_to_id(vote_name),
                                            congress=congress),
                          note='Vote was referenced on the House floor.')

        # reports
        for report in fa.xpath(".//a[@rel='report']"):
            event.add_document('Document referenced by this update.',
                               report.get('href'), media_type='text/html')

        # Committee names in the text usually omit the "House " prefix,
        # so match on the stripped form.
        for name in house_committees:
            if name.replace('House ', '') in fa_text:
                event.add_committee(name, id=make_pseudo_id(name=name))

        # TODO identify legislators and add them as participants?

        yield event
def scrape(self, window=None):
    """Yield Event objects for LA Metro meetings.

    Merges Legistar API events with their web-calendar counterparts.

    :param window: optional number of days to look back; when omitted,
        all events returned by ``self.events`` are scraped.
    """
    # Restrict the scrape window when one was requested.
    since = None
    if window:
        since = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    # Map Legistar agenda statuses onto OCD event statuses; anything
    # unrecognized falls back to 'tentative'.
    status_by_agenda = {
        'Draft': 'confirmed',
        'Final': 'passed',
        'Canceled': 'cancelled',
    }

    for event, web_event in self._merge_events(self.events(since)):
        body_name = event["EventBodyName"]

        # "Board of Directors - <meeting name>" carries the event name
        # after the dash; otherwise the body name doubles as event name.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-')]
        else:
            event_name = body_name

        status = status_by_agenda.get(event['EventAgendaStatusName'],
                                      'tentative')

        # We expect some events to have no location. LA Metro would like
        # these displayed in the Councilmatic interface, but OCD requires
        # a value — substitute a sane default.
        location = event["EventLocation"] or 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Also record the Spanish event GUID when present.
        e.extras = {'guid': event['EventGuid']}
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        # Only capture agenda data from the API when a meeting detail
        # page exists on Legistar.
        if 'event_details' in event:
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])

                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # Record the item number as given in the agenda minutes.
                    agenda_item['notes'].append(
                        "Agenda number, {}".format(item["EventItemAgendaNumber"]))

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                     note='api')
        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # The redirect does not yet point at the audio file;
                # skip this one and retry on the next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self):
    """Yield Event objects for Chicago City Council meetings.

    The meeting-location string doubles as a status field: text after
    "Chicago, Illinois" encodes cancellations, reschedules, room
    changes, etc., and is classified by phrase-matching below.
    """
    for event, agenda in self.events():
        description = None

        # Location strings look like "<room>--<address>--<status text>".
        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Date comes from the listing; time-of-day from the iCalendar feed.
        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        # Anything after "Chicago, Illinois" is status/annotation text.
        status_string = location_list[-1].split('Chicago, Illinois')

        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            # Phrases that imply this listing no longer stands as scheduled.
            if any(phrase in status_text for phrase in
                   ('rescheduled to',
                    'postponed to',
                    'reconvened to',
                    'rescheduled to',
                    'meeting recessed',
                    'recessed meeting',
                    'postponed to',
                    'recessed until',
                    'deferred',
                    'time change',
                    'date change',
                    'recessed meeting - reconvene',
                    'cancelled',
                    'new date and time',
                    'rescheduled indefinitely',
                    'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',):
                status = confirmedOrPassed(when)
            elif status_text in ('amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                status = confirmedOrPassed(when)
            elif 'room' in status_text:
                # NOTE(review): this branch never assigns `status`; on the
                # first loop iteration the Event construction below would
                # raise NameError — confirm intent.
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # NOTE(review): `print` of unrecognized status text looks
                # like leftover debugging — consider the scraper's logger.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = confirmedOrPassed(when)
        else:
            status = confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                            url=event['Video']['url'],
                            type="recording",
                            media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')

        # Normalize a couple of known participant names.
        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant,
                          type="organization")

        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])

                if item["Record #"]:
                    identifier = item["Record #"]['label']
                    # Strip a leading 'S' (substitute designation) so the
                    # identifier matches the underlying bill.
                    if identifier.startswith('S'):
                        identifier = identifier[1:]
                    agenda_item.add_bill(identifier)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def get_events(self):
    """Yield Event objects for Toronto council/committee meetings.

    Downloads per-member attendance CSVs and the meeting-schedule CSV
    from the TMMIS admin-report endpoints, then builds one Event per
    meeting with its attendees and agenda items.

    Report endpoints used:
      http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport
      http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport
    """
    # scrape attendance: one CSV per council member, kept in a temp dir
    # that is now cleaned up even if the scrape aborts mid-way.
    tmpdir = tempfile.mkdtemp()
    try:
        page = self.lxmlize("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport")
        members = page.xpath('//td[@class="inputText"]/select[@name="memberId"]/option')
        for member in members:
            post = {
                "function": "getMemberAttendanceReport",
                "download": "csv",
                "exportPublishReportId": 1,
                "termId": 4,
                "memberId": member.attrib["value"],
                "decisionBodyId": 0,
            }
            r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
            if r.headers["content-type"] != "application/vnd.ms-excel":
                # Not a CSV payload (likely an error page); skip this member.
                continue
            with open(tmpdir + "/" + member.text + ".csv", "w") as attendance_file:
                attendance_file.write(r.text)

        # scrape events
        post = {
            "function": "getMeetingScheduleReport",
            "download": "csv",
            "exportPublishReportId": 3,
            "termId": 4,
            "decisionBodyId": 0,
        }
        r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
        empty = []

        with open("meetings.csv", "w") as meeting_file:
            meeting_file.write(r.text)

        try:
            # NOTE(review): "rb" + csv.reader is Python-2 style; under
            # Python 3 csv expects a text-mode file — confirm runtime.
            with open("meetings.csv", "rb") as csvfile:
                reader = csv.reader(csvfile, delimiter=",")
                next(reader)  # skip the header row

                committee = ""
                agenda_items = []
                for row in reader:
                    name = row[0]
                    when = row[2]
                    when = dt.datetime.strptime(when, "%Y-%m-%d")
                    location = row[5]

                    # Rows are grouped by committee; refresh the agenda
                    # cache only when the committee changes.
                    if name != committee:
                        committee = name
                        agenda_items = find_items(committee)

                    e = Event(name=name,
                              session=self.session,
                              when=when,
                              location=location)

                    attendees = find_attendees(tmpdir, row)
                    if len(attendees) == 0:
                        empty.append(row)
                    # Reuse the already-computed list instead of calling
                    # find_attendees a second time.
                    for attendee in attendees:
                        e.add_person(attendee)

                    e.add_source("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport")

                    # Attach only the agenda items dated on this meeting's day.
                    for item in agenda_items:
                        if item["date"].date() == when.date():
                            i = e.add_agenda_item(item["description"])
                            i.add_committee(committee)
                            i["order"] = item["order"]
                            for link in item["links"]:
                                i.add_media_link(link["name"],
                                                 link["url"],
                                                 on_duplicate="ignore")
                            if "notes" in item:
                                i["notes"] = [item["notes"]]

                    yield e
        finally:
            os.remove("meetings.csv")
    finally:
        shutil.rmtree(tmpdir)
def scrape_lower(self):
    """Yield Event objects for Ohio House committee hearings.

    Parses the chamber's schedule PDF: the text alternates day headers
    ("Wednesday, May 2, 2018") with day bodies, which in turn alternate
    committee names with hearing details.
    """
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    # BUGFIX: was r'(\wF+day, ...)', which cannot match any English
    # weekday name, so the PDF never split into days and no events were
    # emitted. \w+day matches the full %A weekday names parsed below.
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    # Captured headers and day bodies alternate: even offsets are dates,
    # odd offsets are the day's schedule text.
    for day_idx, day_chunk in enumerate(days[1:]):
        if day_idx % 2 == 0:
            date = day_chunk
        else:
            # Within a day, committee names alternate with hearing blocks.
            events = re.split(r'\n((?:\w+\s?)+)\n', day_chunk)
            comm = ''
            for ev_idx, ev_chunk in enumerate(events[1:]):
                if ev_idx % 2 == 0:
                    comm = ev_chunk.strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''', ev_chunk).groups()
                    except AttributeError:
                        # Block doesn't look like a hearing entry; skip it.
                        continue

                    # Normalize "3:00 p.m." -> "3:00 PM" for strptime.
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                    )
                    time = self._tz.localize(time)

                    location = location.strip()

                    # Drop line numbers and blank lines from the description.
                    description = '\n'.join([
                        x.strip() for x in
                        description.split('\n') if
                        x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')

                    # Lines like "H. B. 123  Some relation" link bills
                    # to the hearing.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)

                    yield event
def scrape(self):
    """Yield Event objects for Chicago City Council meetings.

    The meeting-location string doubles as a status field: text after
    "Chicago, Illinois" encodes cancellations, reschedules, room
    changes, etc., and is classified by phrase-matching below.
    """
    for event, agenda in self.events():
        description = None

        # Location strings look like "<room>--<address>--<status text>".
        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Date comes from the listing; time-of-day from the iCalendar feed.
        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        # Anything after "Chicago, Illinois" is status/annotation text.
        status_string = location_list[-1].split('Chicago, Illinois')

        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            # Phrases that imply this listing no longer stands as scheduled.
            if any(phrase in status_text for phrase in
                   ('rescheduled to',
                    'postponed to',
                    'reconvened to',
                    'rescheduled to',
                    'meeting recessed',
                    'recessed meeting',
                    'postponed to',
                    'recessed until',
                    'deferred',
                    'time change',
                    'date change',
                    'recessed meeting - reconvene',
                    'cancelled',
                    'new date and time',
                    'rescheduled indefinitely',
                    'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',):
                status = confirmedOrPassed(when)
            elif status_text in ('amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                status = confirmedOrPassed(when)
            elif 'room' in status_text:
                # NOTE(review): this branch never assigns `status`; on the
                # first loop iteration the Event construction below would
                # raise NameError — confirm intent.
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # NOTE(review): `print` of unrecognized status text looks
                # like leftover debugging — consider the scraper's logger.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = confirmedOrPassed(when)
        else:
            status = confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                            url=event['Video']['url'],
                            type="recording",
                            media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')

        # Normalize a couple of known participant names.
        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant,
                          type="organization")

        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])

                if item["Record #"]:
                    agenda_item.add_bill(item["Record #"]['label'])
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def scrape(self, chamber=None, session=None):
    """Scrape Colorado committee hearings.

    Walks the committee index for each chamber, then each committee's
    scheduled hearings, yielding one Event per hearing page.
    """
    url = 'http://leg.colorado.gov/content/committees'

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        if chamber == 'lower':
            xpath = '//div/h3[text()="House Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'upper':
            xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'other':
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = '//div/h3[not(contains(text(),"House")) and ' \
                    'not(contains(text(),"Senate"))]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'

        page = self.lxmlize(url)
        com_links = page.xpath(xpath)

        for link in com_links:
            page = self.lxmlize(link)

            hearing_links = page.xpath(
                '//div[contains(@class,"schedule-item-content")]'
                '/h4/a/@href')

            for link in hearing_links:
                try:
                    page = self.lxmlize(link)

                    title = page.xpath(
                        '//header/h1[contains(@class,"node-title")]')[0]
                    title = title.text_content().strip()

                    date_day = page.xpath(
                        '//div[contains(@class,"calendar-date")]')[0]
                    date_day = date_day.text_content().strip()

                    # Details are "time | location".
                    details = page.xpath(
                        '//span[contains(@class, "calendar-details")]')[0]
                    details = details.text_content().split('|')

                    date_time = details[0].strip()
                    location = details[1].strip()

                    if 'Upon Adjournment' in date_time:
                        # No clock time available; use midnight of that day.
                        date = dt.datetime.strptime(
                            date_day, '%A %B %d, %Y')
                    else:
                        date_str = '{} {}'.format(date_day, date_time)
                        date = dt.datetime.strptime(
                            date_str, '%A %B %d, %Y %I:%M %p')

                    agendas = []
                    # they overload the bills table w/ other agenda items.
                    # colspan=2 marks a non-bill agenda row.
                    non_bills = page.xpath(
                        '//td[@data-label="Hearing Item" and @colspan="2"]'
                    )
                    for row in non_bills:
                        content = row.text_content().strip()
                        agendas.append(content)

                    agenda = "\n".join(agendas) if agendas else ''

                    event = Event(name=title,
                                  start_date=self._tz.localize(date),
                                  location_name=location)

                    if agenda:
                        event.add_agenda_item(agenda)
                    event.add_source(link)

                    bills = page.xpath(
                        '//td[@data-label="Hearing Item"]/a')
                    for bill in bills:
                        bill_id = bill.text_content().strip()

                        item = event.add_agenda_item("hearing item")
                        item.add_bill(bill_id)

                    yield event
                except Exception as exc:
                    # BUG FIX: previously this swallowed every error silently;
                    # log the skipped page so parse failures are visible.
                    self.warning('skipping hearing page %s: %s', link, exc)
def scrape(self, chamber=None, session=None):
    """Scrape Colorado committee hearings from the committee index pages."""
    url = 'http://leg.colorado.gov/content/committees'

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        if chamber == 'lower':
            xpath = '//div/h3[text()="House Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'upper':
            xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'other':
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = '//div/h3[not(contains(text(),"House")) and ' \
                    'not(contains(text(),"Senate"))]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'

        page = self.lxmlize(url)
        com_links = page.xpath(xpath)

        for link in com_links:
            page = self.lxmlize(link)
            hearing_links = page.xpath('//div[contains(@class,"schedule-item-content")]'
                                       '/h4/a/@href')

            for link in hearing_links:
                try:
                    page = self.lxmlize(link)

                    title = page.xpath('//header/h1[contains(@class,"node-title")]')[0]
                    title = title.text_content().strip()

                    date_day = page.xpath('//div[contains(@class,"calendar-date")]')[0]
                    date_day = date_day.text_content().strip()

                    # "time | location"
                    details = page.xpath('//span[contains(@class, "calendar-details")]')[0]
                    details = details.text_content().split('|')

                    date_time = details[0].strip()
                    location = details[1].strip()

                    if 'Upon Adjournment' in date_time:
                        # No clock time published; fall back to midnight.
                        date = dt.datetime.strptime(date_day, '%A %B %d, %Y')
                    else:
                        date_str = '{} {}'.format(date_day, date_time)
                        date = dt.datetime.strptime(date_str, '%A %B %d, %Y %I:%M %p')

                    agendas = []
                    # they overload the bills table w/ other agenda items.
                    # colspan=2 marks a non-bill agenda row.
                    non_bills = page.xpath('//td[@data-label="Hearing Item" and @colspan="2"]')
                    for row in non_bills:
                        content = row.text_content().strip()
                        agendas.append(content)

                    agenda = "\n".join(agendas) if agendas else ''

                    event = Event(name=title,
                                  start_date=self._tz.localize(date),
                                  location_name=location
                                  )

                    if agenda:
                        event.add_agenda_item(agenda)
                    event.add_source(link)

                    bills = page.xpath('//td[@data-label="Hearing Item"]/a')
                    for bill in bills:
                        bill_id = bill.text_content().strip()
                        item = event.add_agenda_item("hearing item")
                        item.add_bill(bill_id)

                    yield event
                except Exception as exc:
                    # BUG FIX: was a bare `except: pass`, which also traps
                    # KeyboardInterrupt/SystemExit and hides parse failures.
                    self.warning('skipping hearing page %s: %s', link, exc)
def scrape_event_page(self, url, chamber):
    """Scrape one Michigan committee-meeting page into an Event.

    The page is a key/value table (Date, Time, Location, Committee, Chair,
    Agenda); the meeting time string is heavily normalized before parsing.
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {"txt": val.text_content().strip(), "obj": val}

    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf["Date"]["txt"],
        metainf["Time"]["txt"].replace(".", ""),
    )
    if "Cancelled" in datetime:
        return

    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM",  # another weird one
    }

    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])

    datetime = re.sub(r"\s+", " ", datetime)

    for text_to_remove in [
        "or after committees are given leave",
        "or later immediately after committees are given leave",
        "or later after committees are given leave by the House to meet",
        "**Please note time**",
    ]:
        datetime = datetime.split(text_to_remove)[0].strip()

    datetime = datetime.replace("p.m.", "pm")
    datetime = datetime.replace("Noon", "pm")
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        # Some meetings list an hour with no minutes.
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")

    where = metainf["Location"]["txt"]
    title = metainf["Committee"]["txt"]  # XXX: Find a better title

    if chamber == "other":
        chamber = "joint"

    event = Event(name=title, start_date=self._tz.localize(datetime), location_name=where)
    event.add_source(url)
    event.add_source(mi_events)

    chair_name = metainf["Chair"]["txt"].strip()
    if chair_name:
        event.add_participant(chair_name, type="legislator", note="chair")
    else:
        self.warning("No chair found for event '{}'".format(title))

    event.add_participant(metainf["Committee"]["txt"], type="committee", note="host")

    agenda = metainf["Agenda"]["obj"]
    agendas = agenda.text_content().split("\r")

    # BUG FIX: use a relative xpath (".//a"); a bare "//a" searches the
    # whole document, not just the agenda cell.
    related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # BUG FIX: the fallback description was the raw lxml element
        # (`description = agenda`); default to the bill link's text and
        # upgrade to the matching agenda line when one is found.
        description = bill.text_content()
        for a in agendas:
            if bill.text_content() in a:
                description = a
        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())

    yield event
def scrape(self, window=None) :
    """Scrape LA Metro events from the Legistar API, merged with the
    matching web-calendar rows.

    window -- optional look-back horizon in days; when omitted, events
    are requested with no date floor (``n_days_ago`` stays None).
    Yields pupa ``Event`` objects.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        # "Board of Directors - Special Meeting" style names carry the
        # event name after the dash; plain names double as the event name.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                raise ValueError(error_msg.format(event_name=e.name,
                                                  event_date=e.start_date.strftime("%B %d, %Y"),
                                                  legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name,
                          type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note= 'Agenda',
                           url = event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note= 'Minutes',
                           url = event['EventMinutesFile'],
                           media_type="application/pdf")

        # Audio links are behind a redirect; resolve it now.
        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape(self, session=None, start=None, end=None):
    """Scrape Maine committee hearings (public hearings / work sessions).

    start/end -- optional 'YYYY-MM-DD' bounds; defaults to now .. now+30d.
    Bills and submitted testimony documents are attached to each event.
    """
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # testimony url, we'll need it later in a loop
    # testimony query looks gnarly but breaks down to:
    # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
    # $orderby: LastName,FirstName,Organization
    # $expand: Request
    # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
    #          PresentedDate,FileSize,Topic
    testimony_url_base = 'http://legislature.maine.gov/backend/' \
        'breeze/data/CommitteeTestimony?' \
        '$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and' \
        '%20(Request%2FLegislature%20eq%20{})' \
        '&$orderby=LastName%2CFirstName%2COrganization&' \
        '$expand=Request&$select=Id%2CFileType%2CNamePrefix' \
        '%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic'

    if start is None:
        start_date = datetime.datetime.now().isoformat()
    else:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        start_date = start_date.isoformat()

    # default to 30 days if no end
    if end is None:
        dtdelta = datetime.timedelta(days=30)
        end_date = datetime.datetime.now() + dtdelta
        end_date = end_date.isoformat()
    else:
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
        end_date = end_date.isoformat()

    # Index the bills on each event's agenda by event id.
    bills_by_event = {}
    bills_url = 'http://legislature.maine.gov/backend/breeze/data/' \
        'getCalendarEventsBills?startDate={}&endDate={}'
    bills_url = bills_url.format(start_date, end_date)
    page = json.loads(self.get(bills_url).content)

    for row in page:
        bills_by_event.setdefault(row['EventId'], [])
        bills_by_event[row['EventId']].append(row)

    # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
    url = 'http://legislature.maine.gov/backend/breeze/data/' \
        'getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true'
    url = url.format(start_date, end_date)

    page = json.loads(self.get(url).content)

    for row in page:
        if row['Cancelled'] is True or row['Postponed'] is True:
            continue

        start_date = self._TZ.localize(
            dateutil.parser.parse(row['FromDateTime']))
        end_date = self._TZ.localize(
            dateutil.parser.parse(row['ToDateTime']))

        name = row['CommitteeName']
        if name is None:
            name = row['Host']

        address = row['Location']
        address = address.replace(
            'Cross Building',
            'Cross Office Building, 111 Sewall St, Augusta, ME 04330')
        address = address.replace(
            'State House',
            'Maine State House, 210 State St, Augusta, ME 04330')

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=name,
            location_name=address,
        )

        event.add_source(
            'http://legislature.maine.gov/committee/#Committees/{}'.format(
                row['CommitteeCode']))

        if bills_by_event.get(row['Id']):
            for bill in bills_by_event[row['Id']]:
                description = 'LD {}: {}'.format(bill['LD'], bill['Title'])
                agenda = event.add_agenda_item(description=description)
                agenda.add_bill('LD {}'.format(bill['LD']))

                if bill['TestimonyCount'] > 0:
                    test_url = testimony_url_base.format(
                        bill['PaperNumber'], session)
                    test_page = json.loads(self.get(test_url).content)
                    for test in test_page:
                        title = '{} {} - {}'.format(
                            test['FirstName'],
                            test['LastName'],
                            test['Organization'])
                        if test['NamePrefix'] is not None:
                            title = '{} {}'.format(test['NamePrefix'], title)

                        # Renamed from `test_url` to avoid shadowing the
                        # testimony query URL above.
                        doc_url = 'http://legislature.maine.gov/backend/app/services' \
                            '/getDocument.aspx?doctype=test&documentId={}'.format(test['Id'])

                        # BUG FIX: media_type was only assigned for PDFs,
                        # so the first non-pdf testimony raised NameError
                        # (or silently reused a stale value).
                        if test['FileType'] == 'pdf':
                            media_type = "application/pdf"
                        else:
                            media_type = "application/octet-stream"

                        event.add_document(note=title, url=doc_url,
                                           media_type=media_type)

        yield event
def scrape_upper(self): PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf' (path, _response) = self.urlretrieve(PDF_URL) text = convert_pdf(path, type='text').decode() os.remove(path) days = re.split(r'(\w+day, \w+ \d{1,2})', text) date = None for day in enumerate(days[1:]): if day[0] % 2 == 0: # Calendar is put out for the current week, so use that year date = day[1] + ", " + str(datetime.datetime.now().year) else: events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1]) comm = '' for event in enumerate(events[1:]): if event[0] % 2 == 0: comm = event[1].strip() else: try: (time, location, description) = re.search( r'''(?mxs) (\d{1,2}:\d{2}\s[AP]M) # Meeting time .*?,\s # Potential extra text for meeting time (.*?)\n # Location, usually a room .*?\n # Chairman of committee holding event (.*) # Description of event ''', event[1]).groups() except AttributeError: continue time = datetime.datetime.strptime( time + "_" + date, '%I:%M %p_%A, %B %d, %Y' ) time = self._tz.localize(time) location = location.strip() description = '\n'.join([ x.strip() for x in description.split('\n') if x.strip() and not x.strip().startswith("Page ") and not x.strip().startswith("*Possible Vote") and not x.strip() == "NO OTHER COMMITTEES WILL MEET" ]) if not description: description = '[No description provided by state]' event = Event( name=description, start_date=time, location_name=location, description=description ) event.add_source(PDF_URL) event.add_participant(comm, type='committee', note='host') for line in description.split('\n'): related_bill = re.search(r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line) if related_bill: (related_bill, relation) = related_bill.groups() relation = relation.strip() related_bill = related_bill.replace(".", "") item = event.add_agenda_item(relation) item.add_bill(related_bill) yield event
def scrape_event_page(self, url, chamber):
    """Scrape one Michigan committee-meeting detail page into an Event."""
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {
            "txt": val.text_content().strip(),
            "obj": val
        }

    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf['Date']['txt'],
        metainf['Time']['txt'].replace(".", "")
    )
    if "Cancelled" in datetime:
        return

    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM"  # another weird one
    }

    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])

    datetime = re.sub(r"\s+", " ", datetime)

    for text_to_remove in [
            "or after committees are given leave",
            "or later immediately after committees are given leave",
            "or later after committees are given leave by the House to meet",
            "**Please note time**"]:
        datetime = datetime.split(text_to_remove)[0].strip()

    datetime = datetime.replace('p.m.', 'pm')
    datetime = datetime.replace('Noon', "pm")
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        # Some meetings list an hour with no minutes.
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")

    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title

    if chamber == 'other':
        chamber = 'joint'

    event = Event(
        name=title,
        start_date=self._tz.localize(datetime),
        location_name=where,
    )
    event.add_source(url)
    event.add_source(mi_events)

    chair_name = metainf['Chair']['txt'].strip()
    if chair_name:
        event.add_participant(chair_name, type='legislator', note='chair')
    else:
        self.warning("No chair found for event '{}'".format(title))

    event.add_participant(metainf['Committee']['txt'], type='committee', note='host')

    agenda = metainf['Agenda']['obj']
    agendas = agenda.text_content().split("\r")

    # BUG FIX: relative xpath (".//a") — a bare "//a" searches the whole
    # document rather than just the agenda cell.
    related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # BUG FIX: the fallback was `description = agenda`, the raw lxml
        # element; default to the bill link's text instead, upgrading to
        # the matching agenda line when one exists.
        description = bill.text_content()
        for a in agendas:
            if bill.text_content() in a:
                description = a
        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())

    yield event
def scrape_events_range(self, start_date, end_date): def daterange(start_date, end_date): number_of_days = int((end_date - start_date).days) for n in range(number_of_days): yield start_date + datetime.timedelta(n) for date in daterange(start_date, end_date): calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day) events = self.extract_events_by_url(calendar_day_url) for event in events: tz = pytz.timezone("America/Toronto") time = datetime.datetime.strptime(event['time'], '%I:%M %p') start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0)) org_name = event['meeting'] e = Event( name=org_name, start_time=start, timezone=tz.zone, location_name=event['location'], status=STATUS_DICT.get(event['meeting_status']) ) e.extras = { 'meeting_number': event['no'], 'tmmis_meeting_id': event['meeting_id'], } e.add_source(calendar_day_url) e.add_participant( name=org_name, type='organization', ) def is_agenda_available(event): return event['publishing_status'] in ['Agenda Published', 'Minutes Published'] def is_council(event): return True if event['meeting'] == self.jurisdiction.name else False if is_agenda_available(event): agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE agenda_url = agenda_url_template.format(event['meeting_id']) full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council(event))) e.add_source(agenda_url) agenda_items = self.agenda_from_url(agenda_url) for i, item in enumerate(agenda_items): a = e.add_agenda_item(item['title']) a.add_classification(item['type'].lower()) a['order'] = str(i) def normalize_wards(raw): if not raw: raw = 'All' if raw == 'All': return raw.lower() else: return raw.split(', ') identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$') [full_identifier] = [id for id in full_identifiers if identifier_regex.match(id).group(1) == item['identifier']] a.add_bill(full_identifier) yield e
def scrape(self): calendar_url = "http://dccouncil.us/calendar" data = self.get(calendar_url).text doc = lxml.html.fromstring(data) committee_regex = re.compile("(Committee .*?)will") event_list = doc.xpath("//div[@class='event-description-dev']") for event in event_list: place_and_time = event.xpath( ".//div[@class='event-description-dev-metabox']/p/text()") when = " ".join( [place_and_time[0].strip(), place_and_time[1].strip()]) if len(place_and_time) > 2: location = place_and_time[2] else: location = "unknown" # when is now of the following format: # Wednesday, 2/25/2015 9:30am when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p") description_content = event.xpath( ".//div[@class='event-description-content-dev']")[0] description_lines = description_content.xpath("./*") desc_without_title = " ".join(d.text_content() for d in description_lines[1:]) description = re.sub(r'\s+', " ", description_content.text_content()).strip() potential_bills = description_content.xpath(".//li") committee = committee_regex.search(desc_without_title) event_type = 'other' if committee is not None: committee = committee.group(1).strip() event_type = 'committee:meeting' e = Event( name=description, description=description, start_time=self._tz.localize(when), timezone=self._tz.zone, location_name=location, classification=event_type, ) for b in potential_bills: bill = b.xpath("./a/text()") if len(bill) == 0: continue bill = bill[0] bill_desc = b.text_content().replace(bill, "").strip(", ").strip() ses, num = bill.split("-") bill = ses.replace(" ", "") + "-" + num.zfill(4) item = e.add_agenda_item(bill_desc) item.add_bill(bill) e.add_source(calendar_url) if committee: e.add_participant(committee, type='orgnization', note='host') yield e
def scrape(self):
    """Scrape Minnesota House calendar entries into Events.

    NOTE(review): ``house_base``, ``format1``, ``tz``, and
    ``pull_middle_name`` are names defined elsewhere in this file.
    """
    for c in house_base:
        # Defaults so later lookups can't KeyError on sparse entries
        # (previously 'room'/'chair'/'cmt' were only set conditionally).
        m = {
            'notice': c.xpath('.//p/span[@class="cal_special"]/text()'),
            'room': 'n/a',
            'chair': None,
            'cmt': None,
        }
        bills = []

        links = c.xpath('.//h3/a/@href')
        if len(links) > 0:
            m['cmt'] = c.xpath('.//h3/a/text()')[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            title = c.xpath('.//h3/text()')[0]
            if title == 'Agenda:':
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
        else:
            m['title'] = c.xpath('.//h3/text()')[0]
            m['link'] = None

        info_div = c.xpath('.//*[@class="calendar_p_indent"]')
        if len(info_div) > 0:
            info_div = info_div[0]
            info_list = info_div.xpath('.//text()')
            info_list = [x.replace('\n', '').strip() for x in info_list]
            info_list = [x for x in info_list if len(x) > 0]

            if info_list and info_list[0].startswith('Room:'):
                m['room'] = info_list[1]

            if len(info_list) > 2 and info_list[2].startswith('Chair:'):
                chair = info_list[3]
                if ',' in chair:
                    # Multiple chairs, comma separated.
                    chairs = chair.replace('\xa0', '').split(',')
                    nchairs = []
                    for chair in chairs:
                        if chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:])
                            nchairs.append(cname.strip())
                    m['chair'] = nchairs
                elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                    cname = pull_middle_name(chair[4:].strip())
                    m['chair'] = [cname.strip()]

        bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
        for brs in bill_rows:
            cells = brs.xpath('.//td')
            if len(cells) == 3:
                b = {}
                b['bill'] = cells[0].xpath('.//text()')[0]
                b['author'] = cells[1].xpath('./text()')[0]
                b['summary'] = cells[2].xpath('./text()')[0]
                bills.append(b)

        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'

        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            # Replaced the debug print with a log message; can't build
            # an event without a date.
            self.warning('skipping calendar entry with no date: %s', m['title'])
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)

        if 'House Meets in Session' in m['title']:
            m['room'] = 'State leg'
            m['cmt'] = 'Minnesota House of Representatives'
            m['chair'] = None
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'

        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])

        for bill in bills:
            nbill = event.add_agenda_item(description=bill['summary'])
            nbill.add_bill(bill['bill'].replace('HF', 'HF '))

        if m['cmt']:
            event.add_committee(m['cmt'])
        if m['link'] is not None:
            event.add_source(m['link'])
        if m['chair'] is not None:
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")

        yield event
def scrape(self, window=3): n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) for api_event, web_event in self.events(n_days_ago): when = api_event['start'] location = api_event[u'EventLocation'] extracts = self._parse_comment(api_event[u'EventComment']) description, room, status, invalid_event = extracts if invalid_event: continue if room: location = room + ', ' + location if not status: status = api_event['status'] if description: e = Event(name=api_event["EventBodyName"], start_date=when, description=description, location_name=location, status=status) else: e = Event(name=api_event["EventBodyName"], start_date=when, location_name=location, status=status) e.pupa_id = str(api_event['EventId']) if web_event['Meeting video'] != 'Not\xa0available': e.add_media_link(note='Recording', url=web_event['Meeting video']['url'], type="recording", media_type='text/html') self.addDocs(e, web_event, 'Published agenda') self.addDocs(e, web_event, 'Notice') self.addDocs(e, web_event, 'Published summary') if 'Captions' in web_event: self.addDocs(e, web_event, 'Captions') participant = api_event["EventBodyName"] if participant == 'City Council': participant = 'Mountain View City Council' e.add_participant(name=participant, type="organization") for item in self.agenda(api_event): agenda_item = e.add_agenda_item(item["EventItemTitle"]) if item["EventItemMatterFile"]: identifier = item["EventItemMatterFile"] agenda_item.add_bill(identifier) participants = set() for call in self.rollcalls(api_event): if call['RollCallValueName'] == 'Present': participants.add(call['RollCallPersonName']) for person in participants: e.add_participant(name=person, type="person") e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event), note='api') e.add_source(web_event['Meeting Name']['url'], note='web') yield e