def scrape(self):
    for page in self.eventPages(EVENTSPAGE):
        events_table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(events_table):
            print(events)
            location_string = events[u'Meeting\xa0Location']
            location_list = location_string.split('--')
            location = ', '.join(location_list[0:2])

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status = status_string[1].lower()
                if status not in ['cancelled', 'tentative', 'confirmed', 'passed']:
                    print(status)
                    status = 'confirmed'
            else:
                status = 'confirmed'

            when = events[u'Meeting\xa0Date']
            time_string = events[u'Meeting\xa0Time']
            event_time = datetime.datetime.strptime(time_string, "%I:%M %p")
            # Copy both the hour and the minute; setting only the hour
            # silently drops times like 1:30 PM to 1:00 PM.
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            e = Event(name=events["Name"]["label"],
                      when=when,
                      location=location,
                      status=status)
            e.add_source(EVENTSPAGE)

            if events['Video'] != u'Not\xa0available':
                print(events['Video'])

            yield e
def scrape_committee_events(self, code, name):
    events_url = \
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
        'comm_code={}'.format(code)
    events_data = self.get(events_url).text
    events = json.loads(events_data)

    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in events:
        if info['title'] is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        elif info['title'].startswith('CANCELLED:'):
            self.info("Cancelled event found; it will be skipped: {}".
                      format(info['title']))
            continue

        when = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
        # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
        where = "{0} {1}".format(info['building'].strip(),
                                 info['location'].strip())

        # end_time=self._tz.localize(end),
        event = Event(start_time=self._tz.localize(when),
                      timezone=self._tz.zone,
                      location_name=where,
                      name=info['title'],
                      description=info['title'],)
        event.add_source(events_url)

        yield event
def scrape_upper(self):
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    text = page.text_content()
    _, text = text.split('MEETING NOTICES')
    re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

    for match, data in chunks:
        when = match.group()
        when = datetime.datetime.strptime(when, "%A, %B %d, %Y")
        # Materialize to a list so it can be indexed below; a bare
        # filter() object is not subscriptable on Python 3.
        lines = [x.strip() for x in data.splitlines() if x.strip()]

        time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
        time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
        time_ = time.strptime(time_, '%I:%M %p')
        when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)

        title = lines[0]

        where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
        where = where.strip()

        event = Event(name=title,
                      start_date=self._tz.localize(when),
                      location_name=where)
        event.add_source(url)

        yield event
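
# A minimal, self-contained sketch of the finditer/split pairing used in
# scrape_upper above: splitting on the same pattern that finditer matches
# pairs each date heading with the text that follows it. The sample text
# is invented for illustration.
import re

sample = "NOTICES Monday, March 4, 2019 body A Tuesday, March 5, 2019 body B"
re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
for match, data in zip(re.finditer(re_date, sample), re.split(re_date, sample)[1:]):
    print(match.group(), '->', data.strip())
# Monday, March 4, 2019 -> body A
# Tuesday, March 5, 2019 -> body B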
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to. `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)

        if when and description and location:
            event = Event(name=description,
                          start_date=when.replace(tzinfo=self.tz),
                          description=description,
                          location_name=location)
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE: `url` is not defined in this function; it is assumed to
            # be the listing-page URL supplied elsewhere in the scraper.
            event.add_source(url)
            yield event
def event_obj(): e = Event( name="get-together", start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z', location_name="Joe's Place", ) e.add_source(url='http://example.com/foobar') return e
def ge():
    event = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04T05:00Z",
        location_name="America",
        timezone="America/New_York",
        all_day=True)
    event.add_person("George Washington")
    return event
def event_obj(): e = Event( name="get-together", start_time=datetime.datetime.utcnow(), location_name="Joe's Place", timezone="America/New_York", ) e.add_source(url='foobar') return e
def scrape(self, chamber=None):
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue

        event = Event(name=title,
                      start_date=self._tz.localize(
                          datetime.datetime.strptime(when, '%b %d, %Y')),
                      location_name='State Capitol')
        event.add_source(URL)

        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type='committee', note='host')

        documents = doc.xpath('.//td')
        for document in documents:
            url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(note=document.xpath('text()')[0],
                               url=url,
                               media_type='application/pdf')
            bills = document.xpath('@onclick')
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item('Bill up for discussion')
                    item.add_bill(bill_name)

        yield event
def scrape(self):
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(comit_url) != 1:
            raise Exception("Expected exactly one committee link per row")

        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )
        event.add_source(calurl)

        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            if entry.startswith('AB') or entry.startswith('SB'):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event
def scrape_event_page(self, event):
    url = event.attrib['href']
    page = self.lxmlize(url)
    title = page.xpath("//h2[@class='evlist_header']")
    title = title[0].text.strip() if title else None
    if title is None:
        return
    if "CANCELED" in title:
        return

    info = page.xpath("//div[@style='position:relative;margin-right:40px;']")[0]
    blocks = info.xpath(".//div")
    ret = {}
    for block in blocks:
        els = block.xpath("./*")
        if not els:
            continue
        le = els[0]
        if le.tag != 'label':
            continue

        label, div = els
        ltex = label.text_content().strip()
        dtex = div.text_content().strip()
        ret[ltex] = dtex

    when = dt.datetime.utcnow()
    date, start, end = (x.strip() for x in ret['When:'].split("\n"))
    start = re.sub("^@", "", start).strip()
    end = end.replace("-", "").strip()

    replace = [
        ('Apr', 'April'),
    ]
    skip = ["Occurs every"]

    for k, v in replace:
        date = date.replace(k, v).strip()

    if any(x in end for x in skip):
        return

    start = "%s %s" % (date, start)
    end = "%s %s" % (date, end)

    start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                  for x in (start, end))

    event = Event(
        name=title,
        location=ret['Where:'],
        when=start,
        end=end)
    event.add_source(url)
    yield event
def scrape_chamber(self, chamber):
    url = utils.urls['events'][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath('tr'):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = row.xpath(
                'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
            )[-1].text_content().strip()
            location = row.xpath(
                'td[@class="CMS-MeetingDetail-Location"]'
            )[0].text_content().strip()
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
            bills = row.xpath('.//a[contains(@href, "billinfo")]')

            try:
                start_time = datetime.datetime.strptime(
                    '{} {}'.format(date_string, time_string),
                    '%m/%d/%Y %I:%M %p',
                )
            except ValueError:
                break

            event = Event(
                name=description,
                start_time=self._tz.localize(start_time),
                location_name=location,
                timezone=self._tz.zone,
            )
            event.add_source(url)

            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs maps each key to a *list* of values, so index
                    # into it; formatting the lists directly would produce
                    # strings like "['H']['B'] ['1']".
                    item.add_bill('{}{} {}'.format(
                        qs['body'][0], qs['type'][0], qs['bn'][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    item.add_committee(
                        re.sub(r' \([S|H]\)$', '', committee.text),
                        id=qs['Code'][0] if 'Code' in qs else None,
                    )

            yield event
def categorize_data(self, csv_data):
    return_objs = []
    Contribution = namedtuple('Contribution',
                              self.csv_header_row.replace(' ', '_'))
    for line in csv_data.split('\n'):
        # Explicitly splitting the data ourselves, because otherwise this
        # fails in the single-line case.
        if not line:
            continue

        # cur_obj will be the person or organization that made the contribution
        cur_obj = None
        contribution = Contribution(*line.split(','))

        if contribution.Contributor_Type in self.business_contribution_types:
            cur_obj = Organization(contribution.Contributor_Name)
        elif contribution.Contributor_Type in self.individual_contribution_types:
            cur_obj = Person(contribution.Contributor_Name)
        elif contribution.Contributor_Type == 'Unknown/Anonymous':
            if contribution.Contributor_Name:
                # Ignoring un-named contributors;
                # these look like catch-all business contributions.
                cur_obj = Organization(contribution.Contributor_Name)

        if cur_obj:
            # We don't set cur_obj in the event that there was an
            # anonymous/unknown contribution without a Contributor_Name,
            # so we need to check that it exists before adding to it.
            cur_obj.add_source(url=self.search_url)
            cur_obj.source_identified = True
            if contribution.Contributor_Address:
                cur_obj.add_contact_detail(
                    type='address', value=contribution.Contributor_Address)
            if contribution.Employer_Name:
                cur_obj.extras['Employer'] = contribution.Employer_Name
            if contribution.Employer_Occupation:
                cur_obj.extras['Occupation'] = contribution.Employer_Occupation

            # recipient_obj is the organization that received the contribution
            recipient_obj = Organization(contribution.Receiving_Committee)
            recipient_obj.extras['Office'] = contribution.Office
            recipient_obj.extras['Filing Period'] = contribution.Filing_Period
            recipient_obj.extras['Fundtype'] = contribution.Fundtype

            # transaction is the event linking the donor and recipient
            transaction = Event('Contribution', contribution.Contribution_Date,
                                'EST', 'Maryland')  # EST and Maryland b/c MD
            transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
            transaction.extras['Contribution Type'] = contribution.Contribution_Type
            transaction.add_source(url=self.search_url)
            # transaction.source_identified = True
            transaction.participants.append(cur_obj.as_dict())
            transaction.participants.append(recipient_obj.as_dict())
            yield (cur_obj, recipient_obj, transaction)
        else:
            yield []
def scrape(self):
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)

        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting") for x in
               doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue

        name = " ".join(
            x.strip()
            for x in doc.xpath('//div[@class="schedule"]//text()')
            if x.strip()
        )

        # Skip events with no name
        if not name:
            continue

        event = Event(
            start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    info.xpath('span[@class="col02"]/text()')[0],
                    self._DATETIME_FORMAT,
                )
            ),
            name=name,
            location_name=doc.xpath(
                '//div[@class="heading-container"]/span/text()'
            )[0].title()
        )
        event.add_participant(
            info.xpath('span[@class="col01"]/text()')[0].title(),
            type='committee',
            note='host',
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )

        event.add_source(EVENTS_URL)
        event.add_source(event_url.replace(" ", "%20"))

        yield event
def scrape(self, session=None, chamber=None):
    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    page = self.get(url)
    page = csv.reader(StringIO(page.text), delimiter='|')

    for row in page:
        # Deal with embedded newline characters, which cause fake new rows
        LINE_LENGTH = 11
        while len(row) < LINE_LENGTH:
            row += next(page)

        desc = row[7].strip()

        match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
        if match:
            comm = match.group(1).strip()
            comm = re.sub(r'\s+', ' ', comm)
            location = row[5].strip() or 'Unknown'
            when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
            when = self._tz.localize(when)

            # Only assign events to a session if they are in the same year.
            # Given that session metadata have some overlap and
            # missing end dates, this is the best option available.
            session_year = int(session[:4])
            if session_year != when.year:
                continue

            description = "%s MEETING" % comm
            event = Event(
                name=description,
                start_time=when,
                location_name=location,
                description=description,
                timezone=self._tz.zone
            )
            event.add_source(url)

            event.add_participant(comm, type='committee', note='host')

            # time = row[3].strip()
            # if time in TIMECODES:
            #     event['notes'] = TIMECODES[time]

            yield event
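
# Sketch of the fake-row repair used in the Arkansas scraper above: when an
# unquoted field contains an embedded newline, csv.reader emits two short
# rows, and concatenating rows until the minimum column count is reached
# stitches them back together. The pipe-delimited sample is invented.
import csv
from io import StringIO

raw = "a|b|c\nd|e (note\ncontinues)|f\n"
reader = csv.reader(StringIO(raw), delimiter='|')
for row in reader:
    while len(row) < 3:
        row += next(reader)
    print(row)
# ['a', 'b', 'c']
# ['d', 'e (note', 'continues)', 'f']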
def scrape(self):
    curdate = None
    page = self.lxmlize(CAL_PAGE)
    for el in page.xpath("//div[@id='Section1']/*"):
        if el.tag[0] == 'h':
            when = WHEN.findall(el.text_content())
            when = when[0] if when else None
            if when is None:
                continue
            curdate = " ".join(when)
        if el.tag == 'p':  # and el.attrib.get('class') == 'MsoNormal':
            els = el.xpath("./*")
            agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
            agenda = agenda[0] if agenda else None
            if agenda is None:
                continue

            info = self.cleanup(el.text_content())
            when = DT.findall(info)
            when = when[0] if when else None
            if when is None:
                continue

            people = el.xpath(".//personname")
            places = el.xpath(".//place")
            time, ampm = when

            if curdate is None:
                self.warning("Can't scrape, since I don't know what date it is")
                continue

            tbuf = " ".join([curdate, time, ampm])
            obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

            try:
                _, where = info.rsplit(u"–", 1)
            except ValueError:
                continue

            where = where.replace(u" ", " ")
            where = re.sub(r"\s+", " ", where).strip()
            where = re.sub(r"agenda$", "", where).strip()

            event = Event(name=info, when=obj, location=where)
            event.add_source(CAL_PAGE)
            yield event
def scrape_event_page(self, session, chamber, url, datetime):
    # NOTE: the `datetime` parameter is a datetime.datetime instance passed
    # by the caller; it shadows the module of the same name.
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    for p in info:
        content = re.sub(r"\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()

    committee = metainfo['COMMITTEE']
    where = metainfo['PLACE']
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo['PLACE'] = where.strip()
        metainfo['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo['CHAIR']

    plaintext = re.sub(r"\s+", " ", plaintext).strip()
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where
    )
    event.add_source(url)
    event.add_participant(committee, type='committee', note='host')
    if chair is not None:
        event.add_participant(chair, type='legislator', note='chair')

    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        item = event.add_agenda_item('Bill up for discussion')
        item.add_bill(bill_id)

    event.add_agenda_item(plaintext)

    yield event
def scrape_events(self, chamber, event_id):
    url = '%s%s' % (self.upper_url, event_id)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # some ids are empty
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            # type = td[27].text
            meeting_lead = td[28].text

            when = datetime.datetime.strptime(when, "%m/%d/%Y %H:%M %p")
            when = self._tz.localize(when)
            if where is None or where == "":
                where = 'State House'
            event = Event(name=description,
                          start_date=when,
                          location_name=where)
            if td[20].text is None:
                # Wrap the single name in a list; iterating a bare string
                # would add each character as a separate participant.
                participants = [meeting_lead]
            else:
                participants = td[20].text.split(';')
            if participants:
                for participant in participants:
                    name = participant.strip().replace('HON.', '', 1)
                    if name != "":
                        event.add_participant(name, type='committee', note='host')
            event.add_source(url)
            yield event
    else:
        # Hack so we don't fail on the first id numbers, where there are
        # some gaps between the numbers that work and those that don't.
        if event_id > 1700:
            raise Exception(
                "Parsing is done; we are on future ids that are not used yet.")
def categorize_data(self, csv_data):
    # Is there a better place to define this?
    return_objs = []
    Contribution = namedtuple('Contribution',
                              self.csv_header_row.replace(' ', '_'))
    for line in csv_data.split('\n'):
        # Explicitly splitting the data ourselves, because otherwise this
        # fails in the single-line case.
        if not line:
            continue

        cur_obj = None
        try:
            contribution = Contribution(*line.split(','))
        except Exception:
            # A malformed row (wrong number of fields); warn and skip it
            # rather than dropping into the debugger.
            self.warning('skipping malformed line: {}'.format(line))
            continue

        if contribution.Contributor_Type in self.business_contribution_types:
            cur_obj = Organization(contribution.Contributor_Name)
        elif contribution.Contributor_Type in self.individual_contribution_types:
            cur_obj = Person(contribution.Contributor_Name)
        elif contribution.Contributor_Type == 'Unknown/Anonymous':
            if contribution.Contributor_Name:
                # Ignoring un-named contributors;
                # these look like catch-all business contributions.
                cur_obj = Organization(contribution.Contributor_Name)

        if cur_obj:
            cur_obj.add_source(url=self.search_url)
            cur_obj.source_identified = True
            if contribution.Contributor_Address:
                cur_obj.add_contact_detail(
                    type='address', value=contribution.Contributor_Address)
            if contribution.Employer_Name:
                cur_obj.extras['Employer'] = contribution.Employer_Name
            if contribution.Employer_Occupation:
                cur_obj.extras['Occupation'] = contribution.Employer_Occupation

            recipient_obj = Organization(contribution.Receiving_Committee)
            recipient_obj.extras['Office'] = contribution.Office
            recipient_obj.extras['Filing Period'] = contribution.Filing_Period
            recipient_obj.extras['Fundtype'] = contribution.Fundtype

            transaction = Event('Contribution', contribution.Contribution_Date,
                                'EST', 'Maryland')  # EST and Maryland b/c MD
            transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
            transaction.extras['Contribution Type'] = contribution.Contribution_Type
            transaction.add_source(url=self.search_url)
            # transaction.source_identified = True
            transaction.participants.append(cur_obj.as_dict())
            transaction.participants.append(recipient_obj.as_dict())
            yield (cur_obj, recipient_obj, transaction)
        else:
            yield []
def scrape(self, session=None):
    if session is None:
        session = self.latest_session()

    year_slug = session[5:]

    url = 'http://legislature.vermont.gov/committee/loadAllMeetings/{}'.\
        format(year_slug)
    json_data = self.get(url).text
    events = json.loads(json_data)['data']

    for info in events:
        # Determine when the committee meets
        if info['TimeSlot'] == '1':
            start_time = datetime.datetime.strptime(info['MeetingDate'],
                                                    '%A, %B %d, %Y')
            all_day = True
        else:
            try:
                start_time = datetime.datetime.strptime(
                    info['MeetingDate'] + ', ' + info['TimeSlot'],
                    '%A, %B %d, %Y, %I:%M %p'
                )
            except ValueError:
                start_time = datetime.datetime.strptime(
                    info['MeetingDate'] + ', ' + info['StartTime'],
                    '%A, %B %d, %Y, %I:%M %p'
                )
            all_day = False

        event = Event(
            start_time=self.TIMEZONE.localize(start_time),
            timezone='America/New_York',
            all_day=all_day,
            name="Meeting of the {}".format(info['LongName']),
            description="committee meeting",
            location_name="{0}, Room {1}".format(info['BuildingName'],
                                                 info['RoomNbr'])
        )
        event.add_source(url)
        event.add_committee(name=info['LongName'], note='host')

        yield event
def scrape(self):
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)

    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception("Expected exactly one description span")
        descr = descr[0].replace('.', '').strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = pytz.utc.localize(when)
        event = Event(name=descr,
                      start_time=when,
                      classification='committee-meeting',
                      description=descr,
                      location_name=where,
                      timezone=tz.zone)

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                # Look up the full committee record by its short ID,
                # falling back to a stub if the code is unknown.
                committee = self.short_ids.get(
                    committee, {"chamber": "unknown", "name": committee})
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type='text/html')

        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(bill['bill_id'], note=bill['type'])

        yield event
def scrape(self):
    local_timezone = pytz.timezone("US/Eastern")
    base_calendar_url = "http://www.miamidade.gov/cob/county-commission-calendar.asp"
    # Things get messy more than a few months out,
    # so we're just pulling 3 months. If we want three
    # more, they are called "nxx", "nxy" and "nxz".
    months = ["cur", "nex", "nxw"]
    for m in months:
        doc = self.lxmlize(base_calendar_url + "?next={}".format(m))
        events = doc.xpath("//table[contains(@style,'dotted #ccc')]")
        for event in events:
            rows = event.xpath(".//tr")
            for row in rows:
                heading, data = row.xpath(".//td")
                h = heading.text_content().lower().replace(":", "").strip()
                if h == "event":
                    title = data.text_content()
                    link = data.xpath(".//a")[0].attrib["href"]
                elif h == "event date":
                    # %I (12-hour clock) is required here; %p has no effect
                    # when the hour is parsed with %H.
                    when = datetime.strptime(data.text, '%m/%d/%y %I:%M%p')
                    when = local_timezone.localize(when)
                elif h == "location":
                    where = data.text
                elif h == "description":
                    description = data.text

            if not description:
                description = ""

            status = "confirmed"
            if "cancelled" in title.lower():
                status = "cancelled"

            e = Event(name=title,
                      start_time=when,
                      timezone="US/Eastern",
                      location_name=where,
                      description=description,
                      status=status)
            e.add_source(link)
            yield e
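
# Demonstration of why the format string above uses %I rather than %H:
# strptime's %p marker only takes effect with the 12-hour %I directive, so
# %H silently ignores "PM". The timestamp is invented.
import datetime

BAD, GOOD = "%m/%d/%y %H:%M%p", "%m/%d/%y %I:%M%p"
print(datetime.datetime.strptime("01/15/19 3:30PM", BAD).hour)   # 3
print(datetime.datetime.strptime("01/15/19 3:30PM", GOOD).hour)  # 15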
def scrape_house_weekly_schedule(self):
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)

    meeting_rows = page.xpath('//table[@id = "table229"]/tr')

    valid_meetings = [
        row for row in meeting_rows
        if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
    ]

    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                 '"PDF-AGENDA.png")]]/@href')[0]
            # self.logger.debug(guid)
            self.warning("logger.debug" + guid)
        except IndexError:
            # Sometimes we have a dead link. This is only on dead entries,
            # and indexing the empty xpath result raises IndexError.
            continue

        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()

        if "@" in meeting_string:
            continue  # Contains no time data.

        date, time, location = ([s.strip() for s in meeting_string.split(',')
                                 if s] + [None] * 3)[:3]

        # check for time in date because of missing comma
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')

        # self.logger.debug(location)
        self.warning("logger.debug" + location)

        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
        # Localize exactly once; localizing an already-aware datetime raises.
        when = self._tz.localize(when)

        description = 'Committee Meeting: {}'.format(committee_name)
        # self.logger.debug(description)
        self.warning("logger.debug" + description)

        event = Event(name=description,
                      start_date=when,
                      location_name=location)
        event.add_source(url)
        event.add_participant(committee_name, type='committee', note='host')
        event.add_document(note='Agenda', url=guid, text='agenda',
                           media_type='application/pdf')

        yield event
def scrape_upper_events(self):
    url = "https://www.flsenate.gov/Tracker/RSS/DailyCalendar"
    page = self.get(url).text
    feed = feedparser.parse(page)

    for entry in feed['entries']:
        # The feed breaks the RSS standard by making the pubdate the
        # actual event's date, not the RSS item publish date
        when = datetime.datetime(*entry['published_parsed'][:6])
        when = pytz.utc.localize(when)

        desc = entry['summary'].split(' - ')[0]
        location = entry['summary'].split(' - ')[1]

        event = Event(name=desc,
                      start_date=when,
                      description=desc,
                      location_name=location)
        event.add_source(entry['link'])
        yield event
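
# Hedged sketch of the published_parsed conversion used above: feedparser
# exposes dates as time.struct_time values, whose first six fields map
# directly onto the datetime constructor. The struct here is built from a
# made-up timestamp rather than a live feed.
import datetime
import time

published_parsed = time.strptime("2019-03-04 09:30:00", "%Y-%m-%d %H:%M:%S")
when = datetime.datetime(*published_parsed[:6])
print(when)  # 2019-03-04 09:30:00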
def scrape(self):
    start = dt.datetime.utcnow()
    start = start - dt.timedelta(days=10)
    end = start + dt.timedelta(days=30)

    url = URL.format(**{"from": start.strftime("%Y/%m/%d"),
                        "til": end.strftime("%Y/%m/%d")})

    page = self.lxmlize(url)
    events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

    for event in events:
        string = event.text_content()

        po = CLICK_INFO.match(event.xpath(".//span")[0].attrib["onclick"])
        if po is None:
            continue

        poid = po.groupdict()["info_id"]  # This is used to get more deetz on

        popage = self.popOverUrl(poid)

        when = dt.datetime.strptime(popage.xpath("//strong")[0].text,
                                    "%B %d, %Y @ %I:%M %p")
        who = popage.xpath("//h1")[0].text

        related = []
        for item in popage.xpath("//div"):
            t = item.text
            if t is None:
                continue
            t = t.strip()
            for related_entity in ORD_INFO.findall(t):
                related.append({"ord_no": related_entity, "what": t})

        e = Event(name=who, when=when, location="unknown")
        e.add_source(url)

        for o in related:
            i = e.add_agenda_item(o["what"])
            i.add_bill(o["ord_no"], note="consideration")

        yield e
def scrape_event(self, href):
    page = self.lxmlize(href.attrib['href'])

    what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text
    info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:]

    ret = {
        "Location:": "Unknown"
    }

    for tr in info:
        tds = tr.xpath(".//td")
        if len(tds) < 2:
            continue
        # Use a distinct name here so we don't clobber the event title
        # stored in `what` above.
        key, data = [tds.pop(0).text_content().strip() for _ in range(2)]
        ret[key] = data

    agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]")
    if agendas:
        for agenda in agendas:
            print("Agenda:", agenda.attrib['href'])

    t = ret['Time:']
    start_time, end_time = t, None
    if "-" in t:
        start_time, end_time = (x.strip() for x in t.split("-", 1))

    start_time = "%s %s" % (ret['Date:'], start_time)

    dts = "%B %d, %Y %I:%M %p"
    start = dt.datetime.strptime(start_time, dts)

    end = None
    if end_time:
        end = "%s %s" % (ret['Date:'], end_time)
        end = dt.datetime.strptime(end, dts)

    kwargs = {}
    if end:
        kwargs['end'] = end

    e = Event(name=what, location=ret['Location:'], when=start, **kwargs)
    e.add_source(href.attrib['href'])
    yield e
def scrape_event(self, row):
    date_td = row.xpath('td[1]')[0]
    info_td = row.xpath('td[2]')[0]

    date = date_td.xpath('b')[0].text.strip()
    time = date_td.xpath('b/following-sibling::text()')[0].strip()

    date_and_time = "{} {}".format(date, time)
    start_date = datetime.datetime.strptime(date_and_time, '%m/%d/%y %I:%M %p')

    title = info_td.xpath('font[1]/strong')[0].text.strip()

    all_text = info_td.xpath('descendant-or-self::*/text()')
    notes = (line.strip() for line in all_text if line.strip())
    notes = list(notes)
    # Skip the first line, which is the title
    notes = notes[1:]
    # Split out the address
    address = notes[0]
    notes = notes[1:]
    # The rest just becomes the description
    notes = "\n".join(notes)

    event = Event(
        start_date=self._TZ.localize(start_date),
        name=title,
        location_name=address,
        description=notes
    )
    event.add_source(self.URL)

    if info_td.xpath('a[contains(font/text(),"agenda")]'):
        agenda_url = info_td.xpath('a/@href')[0]
        event.add_document("Agenda", url=agenda_url)

    yield event
def scrape(self): url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx" page = self.lxmlize(url) for entry in page.xpath( "//tr[@style='font-family: Verdana; font-size: 12px;']"): name, when, links = entry.xpath(".//td") name = name.text.strip().replace(u"\xc2\xa0", "") when = when.text.strip().replace(u"\xc2\xa0", "") when = dt.datetime.strptime(when, "%m/%d/%Y") links = links.xpath(".//a") links = {x.text: x.attrib['href'] for x in links} e = Event(name=name, when=when, location='unknown') e.add_source(url) for note, url in links.items(): e.add_link(note=note, url=url) yield e
def scrape_upper(self):
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'

    html = self.get(listing_url).text

    # The HTML here isn't wrapped in a container per-event,
    # which makes xpath a pain. So string split by <hr>,
    # then parse each event's fragment for cleaner results.
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')

        location = self.row_content(page, 'Room:')
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

        # com = self.row_content(page, 'Committee:')
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        com = com.split(', Senator')[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )

        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )
        event.add_source(listing_url)

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_link = ''
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

                bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

        yield event
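
# Minimal sketch of the split-by-<hr /> strategy above: when events share no
# per-event container, splitting the raw HTML first gives each fragment its
# own small parse tree. The markup is invented.
import lxml.html

raw = "header<hr /><b>Date:</b> 1/2/19<hr /><b>Date:</b> 3/4/19"
for fragment in raw.split('<hr />')[1:]:
    page = lxml.html.fromstring(fragment)
    print(page.text_content())
# Date: 1/2/19
# Date: 3/4/19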
def test_full_event():
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    event = ScrapeEvent(name="America's Birthday",
                        start_time="2014-07-04",
                        location="America",
                        all_day=True)
    event.add_person("George Washington")
    event.add_media_link("fireworks", "http://example.com/fireworks.mov")

    EventImporter('jid').import_data([event.as_dict()])
def scrape(self):
    EVENTS_URL = 'http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx'
    rows = self.lxmlize(EVENTS_URL).xpath(
        '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
    for row in rows[1:]:
        date = row.xpath('td')[0].text_content().strip()
        time = row.xpath('td')[1].text_content().strip()
        date_with_time = '{} {}'.format(date, time)

        location = row.xpath('td')[2].text_content().strip()
        # 11 South Union Street, Montgomery, Alabama, United States
        # TODO: if location is "room (X)", add the state house address
        # TODO: replace "state house" with its address
        # 32°22′37.294″N 86°17′57.991″W

        # host = row.xpath('td')[3].text_content().strip()
        name = row.xpath('td')[3].text_content().strip()
        details = row.xpath('td')[4].text_content().strip()

        event = Event(
            start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    date_with_time,
                    self._DATETIME_FORMAT,
                )
            ),
            name=name,
            location_name=location,
            description=details
        )
        event.add_source(EVENTS_URL)
        yield event
def scrape_meeting_notice(self, chamber, item, url):
    # Event Name is not provided for all meetings, so fall back to the
    # committee name.
    event_name = str(item["CommitteeName"])
    # Example value: "04/25/2012 03:00:00 PM" -- the format string must
    # match the 4-digit year and the seconds.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_source(url)
    event.add_committee(name=str(item["CommitteeName"]),
                        id=item["CommitteeId"])

    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item["CommitteeMeetingId"]))
    event.add_source(page_url)

    page_data = self.post(page_url).json()["Data"]
    for item in page_data:
        event.add_agenda_item(description=str(item["ItemDescription"]))
        event.add_person(
            name=str(item["PrimarySponsorShortName"]),
            id=str(item["PrimarySponsorPersonId"]),
            note="Sponsor",
        )

    yield event
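
# Quick check that the format string above matches the sample timestamp
# documented in its comment (the value comes from that comment, not from
# live Delaware data).
import datetime as dt

fmt = "%m/%d/%Y %I:%M:%S %p"
print(dt.datetime.strptime("04/25/2012 03:00:00 PM", fmt))
# 2012-04-25 15:00:00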
def scrape(self, chamber=None, session=None):
    url = "http://leg.colorado.gov/content/committees"

    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    chambers = [chamber] if chamber else ["upper", "lower"]

    for chamber in chambers:
        if chamber == "lower":
            xpath = (
                '//div/h3[text()="House Committees of Reference"]/../'
                'following-sibling::div[contains(@class,"view-content")]/'
                'table//td//span[contains(@class,"field-content")]/a/@href'
            )
        elif chamber == "upper":
            xpath = (
                '//div/h3[text()="Senate Committees of Reference"]/../'
                'following-sibling::div[contains(@class,"view-content")]/'
                'table//td//span[contains(@class,"field-content")]/a/@href'
            )
        elif chamber == "other":
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = (
                '//div/h3[not(contains(text(),"House")) and '
                'not(contains(text(),"Senate"))]/../'
                'following-sibling::div[contains(@class,"view-content")]/'
                'table//td//span[contains(@class,"field-content")]/a/@href'
            )

        page = self.lxmlize(url)
        com_links = page.xpath(xpath)

        for link in com_links:
            page = self.lxmlize(link)

            hearing_links = page.xpath(
                '//div[contains(@class,"schedule-item-content")]'
                '/h4/a/@href'
            )

            for link in hearing_links:
                try:
                    page = self.lxmlize(link)

                    title = page.xpath(
                        '//header/h1[contains(@class,"node-title")]'
                    )[0]
                    title = title.text_content().strip()

                    date_day = page.xpath(
                        '//div[contains(@class,"calendar-date")]'
                    )[0]
                    date_day = date_day.text_content().strip()

                    details = page.xpath(
                        '//span[contains(@class, "calendar-details")]'
                    )[0]
                    details = details.text_content().split("|")
                    date_time = details[0].strip()
                    location = details[1].strip()

                    if "Upon Adjournment" in date_time:
                        date = dt.datetime.strptime(date_day, "%A %B %d, %Y")
                    else:
                        date_str = "{} {}".format(date_day, date_time)
                        date = dt.datetime.strptime(
                            date_str, "%A %B %d, %Y %I:%M %p"
                        )

                    agendas = []
                    # They overload the bills table with other agenda items;
                    # colspan=2 marks a non-bill agenda row.
                    non_bills = page.xpath(
                        '//td[@data-label="Hearing Item" and @colspan="2"]'
                    )
                    for row in non_bills:
                        content = row.text_content().strip()
                        agendas.append(content)

                    agenda = "\n".join(agendas) if agendas else ""

                    event = Event(
                        name=title,
                        start_date=self._tz.localize(date),
                        location_name=location,
                    )
                    if agenda:
                        event.add_agenda_item(agenda)
                    event.add_source(link)

                    bills = page.xpath('//td[@data-label="Hearing Item"]/a')
                    for bill in bills:
                        bill_id = bill.text_content().strip()
                        item = event.add_agenda_item("hearing item")
                        item.add_bill(bill_id)

                    yield event
                except Exception:  # TODO: this is awful
                    pass
def scrape(self):
    # `house_base`, `format1`, `tz`, and `pull_middle_name` are assumed to
    # be module-level names in the original scraper.
    for c in house_base:
        m = {}
        # Defensive defaults, so later lookups can't fail when the info
        # block is missing from a calendar entry.
        m['room'] = 'n/a'
        m['chair'] = None
        m['cmt'] = None

        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        links = c.xpath('.//h3/a/@href')
        if len(links) > 0:
            m['cmt'] = c.xpath('.//h3/a/text()')[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            title = c.xpath('.//h3/text()')[0]
            if title == 'Agenda:':
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
        else:
            m['title'] = c.xpath('.//h3/text()')[0]
            m['link'] = None

        bills = []
        info_div = c.xpath('.//*[@class="calendar_p_indent"]')
        if len(info_div) > 0:
            info_div = info_div[0]
            print('Info Div: ', info_div)
            if len(info_div) > 0:
                info_list = info_div.xpath('.//text()')
                info_links = info_div.xpath('.//*/@href')
                print("info links: ", info_links)
                info_list = [x.replace('\n', '').strip() for x in info_list]
                info_list = [x for x in info_list if len(x) > 0]
                print('Info list: ', info_list)

                if info_list[0].startswith('Room:'):
                    m['room'] = info_list[1]

                if len(info_list) > 2 and info_list[2].startswith('Chair:'):
                    chair = info_list[3]
                    if ',' in chair:
                        chairs = chair.replace('\xa0', '').split(',')
                        nchairs = []
                        for chair in chairs:
                            if chair.startswith('Rep.') or chair.startswith('Sen.'):
                                cname = pull_middle_name(chair[4:])
                                nchairs.append(cname.strip())
                        m['chair'] = nchairs
                    elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                        cname = pull_middle_name(chair[4:].strip())
                        m['chair'] = [cname.strip()]

            bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
            print('Bills: ', bill_rows)
            for brs in bill_rows:
                cells = brs.xpath('.//td')
                if len(cells) == 3:
                    b = {}
                    b['bill'] = cells[0].xpath('.//text()')[0]
                    b['author'] = cells[1].xpath('./text()')[0]
                    b['summary'] = cells[2].xpath('./text()')[0]
                    bills.append(b)

        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'

        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            print('\n\n\n\n NO DATE')
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)

        if 'House Meets in Session' in m['title']:
            m['room'] = 'State leg'
            m['cmt'] = 'Minnesota House of Representatives'
            m['chair'] = None
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'

        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])
        if len(bills) > 0:
            for bill in bills:
                nbill = event.add_agenda_item(description=bill['summary'])
                nbill.add_bill(bill['bill'].replace('HF', 'HF '))

        if m['cmt']:
            event.add_committee(m['cmt'])
        if m['link'] is not None:
            event.add_source(m['link'])
        if m['chair'] is not None:
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")

        yield event
def scrape(self):
    last_events = deque(maxlen=10)
    for event, agenda in self.events(since=2017):
        other_orgs = ''
        extras = []

        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split('--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']

        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])

        response = self.get(event['iCalendar']['url'], verify=False)
        event_time = self.ical(response.text).subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        time_string = event['Meeting Time']
        if time_string in ('Deferred',):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'

        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED')):
            description = ''

        event_name = event['Name']

        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)

        e = Event(name=event_name,
                  start_time=when,
                  timezone=self.TIMEZONE,
                  description=description,
                  location_name=location,
                  status=status)

        if extras:
            e.extras = {'location note': ' '.join(extras)}

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []

        if other_orgs:
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        if agenda:
            e.add_source(event["Meeting Details"]['url'], note='web')

            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def scrape(self, window=None):
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                error_msg = ('An agenda has duplicate agenda items on the Legistar grid: '
                             '{event_name} on {event_date} ({legistar_api_url}). '
                             'Contact Metro, and ask them to remove the duplicate '
                             'EventItemAgendaSequence.')
                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape_chamber(self, chamber): """ Scrape upper or lower committee agendas """ # session = self.latest_session() # since we are scraping only latest_session # session_id = self.session_metadata.session_id_meta_data[session] # could use &ShowAll=ON doesn't seem to work though url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % self._chamber_short[chamber] html_ = self.get(url).text doc = html.fromstring(html_) if chamber == 'upper': event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/' 'tr/td/table/tr/td/table')[0] else: event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr' '/td/table/tr/td/table/tr/td/table')[0] for row in event_table.xpath('tr')[2:]: # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room, # HTML Document, PDF Document for house # Agenda Date, Committee, Revised, Cancelled, Time, Room, # HTML Document, PDF Document for senate text = [x.text_content().strip() for x in row.xpath('td')] when, committee = text[0:2] if chamber == 'upper': time, room = text[4:6] link = row[6].xpath('string(a/@href)') else: time, room = text[5:7] link = row[7].xpath('string(a/@href)') if 'NOT MEETING' in time or 'CANCELLED' in time: continue time = re.match('(\d+:\d+ (A|P))', time) if time: when = "%s %sM" % (text[0], time.group(0)) when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p') else: when = text[0] when = datetime.datetime.strptime(when, '%m/%d/%Y') title = "Committee Meeting:\n%s %s %s\n" % ( self._chamber_long[chamber], committee, room) agenda_info = self.parse_agenda(chamber, link) description = agenda_info['description'] member_list = agenda_info['member_list'] related_bills = agenda_info['related_bills'] print(related_bills) """ event = Event(session, when, 'committee:meeting', title, location=room, link=link, details=description, related_bills=related_bills) """ event = Event(location_name=room, start_date=self._tz.localize(when), name=title, description=description, ) event.add_participant(committee, type='committee', note='host') event.participants.extend(member_list) event.add_source(url) event.add_source(link) # print event['when'].timetuple() # import ipdb;ipdb.set_trace() yield event
def scrape_upper(self):
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text').decode()
    os.remove(path)

    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day[1] + ", " + str(datetime.datetime.now().year)
        else:
            events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                            .*?,\s                  # Potential extra text for meeting time
                            (.*?)\n                 # Location, usually a room
                            .*?\n                   # Chairman of committee holding event
                            (.*)                    # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        continue

                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)

                    location = location.strip()

                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')

                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)

                    yield event
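
# Sketch of the alternating re.split pattern used by the Ohio Senate scraper
# above (and its House counterpart below): splitting on a *capturing* group
# keeps the delimiters in the result, so after dropping the preamble the list
# alternates between a date heading (even indexes) and that day's body (odd
# indexes), which is exactly what the `day[0] % 2 == 0` test relies on.
# Sample text is invented.
import re

text = "preamble Monday, May 6 monday-events Tuesday, May 7 tuesday-events"
days = re.split(r'(\w+day, \w+ \d{1,2})', text)
for idx, chunk in enumerate(days[1:]):
    kind = 'date' if idx % 2 == 0 else 'body'
    print(kind, ':', chunk.strip())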
def scrape(self, window=None):
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                     note='api')

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape(self):
    current_date = datetime.today()
    current_month = current_date.month
    current_year = current_date.year
    date_range = []
    print(current_month)
    for x in range(0, 4):
        if not current_month == 12:
            cm = current_month
            if len(str(cm)) < 2:
                cm = '0{0}'.format(cm)
            timestamp = "{0}-{1}".format(current_year, cm)
            date_range.append(timestamp)
            current_month += 1
        elif current_month == 12:
            cm = '12'
            timestamp = "{0}-{1}".format(current_year, cm)
            date_range.append(timestamp)
            current_month = 1
            current_year += 1

    format1 = "%A %B %d, %Y - %I:%M %p"
    format2 = "%A %B %d, %Y - "
    format3 = "%m/%d/%y"

    for date in date_range:
        root = requests.get("https://www.stpaul.gov/calendar/" + date)
        base = html.fromstring(root.text)
        items = base.xpath('.//*/div[@class="view-content"]/div')
        meetings = []
        for i in items:
            if len(i.xpath('.//*/span[@class="date-display-single"]/text()')) > 0:
                d = {}
                d['date'] = i.xpath(
                    './/*/span[@class="date-display-single"]/text()')[0]
                d['info'] = i.xpath(
                    './/*/span[@class="field-content"]/a/text()')[0]
                d['link'] = i.xpath(
                    './/*/span[@class="field-content"]/a/@href')[0]
                meetings.append(d)

        for m in meetings:
            m['link'] = "https://www.stpaul.gov" + m['link']

        for m in meetings:
            ppr(m['info'])
            r = requests.get(m['link'])
            b = html.fromstring(r.text)
            exists = b.xpath('.//div[@class="node-content clearfix"]')
            if len(exists) > 0:
                date = exists[0].xpath(
                    './/*/span[@class="date-display-single"]/text()')
                loc1 = exists[0].xpath(
                    './/*/div[@class="thoroughfare"]/text()')
                loc2 = exists[0].xpath('.//*/div[@class="premise"]/text()')
                if len(loc1) > 0:
                    m['location'] = loc1[0]
                    if len(loc2) > 0:
                        m['location'] = m['location'] + " " + loc2[0]
                else:
                    m['location'] = 'N/A'

                if ":" in date[0]:
                    m['date'] = datetime.strptime(date[0], format1)
                elif "/" in date[0]:
                    # Zero-pad each component via a comprehension;
                    # reassigning the loop variable would not modify
                    # the list in place.
                    new_date = date[0].split('/')
                    new_date = ['0' + n if len(n) == 1 else n for n in new_date]
                    new_date = '/'.join(new_date)
                    m['date'] = datetime.strptime(new_date, format3)
                else:
                    date = datetime.strptime(date[0], format2)
                    m['date'] = date

                m['date'] = tz.localize(m['date'])

                if ('City Council' not in m['info']
                        and 'Legislative' not in m['info']
                        and 'Holiday' not in m['info']):
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                    m['name'] = m['info'].replace('Meeting', '').replace(
                        ' - Cancelled', '').replace('Events', '').strip()
                    event.add_committee(m['name'])
                elif 'Holiday' in m['info']:
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                else:
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                    event.add_committee('Saint Paul City Council')

                event.add_source(m['link'])
                yield event
def scrape_lower(self):
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            date = day[1]
        else:
            events = re.split(r'\n((?:\w+\s?)+)\n', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s                      # Potential extra text for meeting time
                            (.*?),\s                    # Location, usually a room
                            .*?\n                       # Chairman of committee holding event
                            (.*)                        # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        continue

                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)

                    location = location.strip()

                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')

                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)

                    yield event
def scrape(self, window=3):
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    for api_event, event in self.events(n_days_ago):
        when = api_event['start']
        location = api_event['EventLocation']

        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED')):
            description = None

        if description:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=api_event['status'])
        else:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      location_name=location,
                      status=api_event['status'])

        e.pupa_id = str(api_event['EventId'])

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        location_string = event[u'Meeting Location']
        location_notes, other_orgs = self._parse_location(location_string)

        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        if e.name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in e.name.lower():
            participating_orgs = [e.name]
        else:
            participating_orgs = []

        if other_orgs:
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'].strip())

        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e
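
# A minimal end-to-end sketch of the pupa Event API targeted by most of the
# scrapers above, in its keyword style (start_date/location_name). All
# values are illustrative, and the import path assumes pupa is installed.
import datetime

import pytz
from pupa.scrape import Event

tz = pytz.timezone('America/Chicago')
when = tz.localize(datetime.datetime(2019, 5, 6, 9, 30))

e = Event(name='Example Committee Hearing',
          start_date=when,
          location_name='Room 100')
e.add_source('http://example.com/calendar')
e.add_participant('Example Committee', type='committee', note='host')
item = e.add_agenda_item('Bill up for discussion')
item.add_bill('HB 1', note='consideration')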