def get_events(self):
    """Yield an Event for each New York executive order listed on the
    governor's executive-order index page.

    Each order's own page is fetched to recover its signing date from
    the "given ... this Nth day of ... in the year ..." clause.
    """
    # get list of executive orders
    index_url = 'http://www.governor.ny.gov/sl2/ExecutiveOrderindex'
    page = self.urlopen(index_url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(index_url)

    # extract governor's name from the page header
    gov = page.xpath("(//div[@class='section-header']/div/div/div/a/div/h2)[1]")[0]
    # BUG FIX: .lstrip('Governor ') stripped any leading character from the
    # set {G,o,v,e,r,n,space}, which mangles names that begin with one of
    # those characters. Strip the literal "Governor " prefix instead.
    governor_name = re.sub(r'^Governor\s+', '', gov.text)

    # scrape each executive order
    for eo_par in page.xpath("//div[@class='content']/p"):
        for link in eo_par.xpath(".//a"):
            url = link.get('href').lower()
            if url.endswith('.pdf'):
                continue  # only follow the HTML version of each order

            # fetch the order's page and flatten its text to locate the date
            eo_page = self.urlopen(url)
            eo_page = lxml.html.fromstring(eo_page)
            eo_page = re.sub(r'(\r*\n|\W)', ' ', eo_page.xpath('string()').lower())
            eo_page = re.sub(r'\s+', ' ', eo_page)

            # the signing date sits between "g i v e n" and "by the governor"
            # (letters are space-separated because \W above split them apart)
            date_par = re.search('(?:g i v e n)(.*)(?:by the governor)',
                                 eo_page).groups()[0]
            date_comp = [s.strip() for s in re.match(
                '(?:.*this)(.*)(?:day of)(.*)(?:in the year)(.*)',
                date_par).groups()]
            # Wtn.parse presumably converts spelled-out numbers
            # ("twenty first") to integers — confirm against Wtn's docs
            eo_date = dt.datetime.strptime(
                ' '.join((str(Wtn.parse(date_comp[0])),
                          date_comp[1],
                          str(Wtn.parse(date_comp[2])))),
                '%d %B %Y')

            # build yield object
            eo_number = eo_par.xpath('string()').split(':', 1)[0]
            eo = Event(eo_number, eo_date, 'New York')
            eo.add_person(governor_name, 'governor')
            eo.description = link.text
            eo.add_document(eo_number, url, 'text/html')
            eo.add_source(url)
            yield eo

    # TODO: get list of press statements
def get_events(self):
    """Yield Event objects scraped from the NYC Council Legistar calendar."""
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    url = "http://legistar.council.nyc.gov/Calendar.aspx"
    page = self.lxmlize(url)
    table = page.xpath("//table[@class='rgMasterTable']")[0]

    for row in table.xpath(".//tr")[1:]:
        cells = row.xpath(".//td")
        if len(cells) <= 2:
            # Odd one-off row; skip it.
            continue

        # The third cell (ignored) is the calendar icon beside the date.
        (name_cell, date_cell, _, time_cell, where_cell, topic_cell,
         details_cell, agenda_cell, minutes_cell, media_cell) = cells

        name = name_cell.text_content().strip()  # leaving an href on the table
        time = time_cell.text_content().strip()
        location = where_cell.text_content().strip()
        topic = topic_cell.text_content().strip()

        if "Deferred" in time:
            continue

        all_day = (time == "")
        if all_day:
            when = dt.datetime.strptime(date_cell.text.strip(), "%m/%d/%Y")
        else:
            when = dt.datetime.strptime("%s %s" % (date_cell.text.strip(), time),
                                        "%m/%d/%Y %I:%M %p")

        event = Event(name=name, session=self.session, when=when,
                      location=location)
        event.add_source(url)

        # Attach every linked detail page, agenda, and minutes document.
        for anchor in details_cell.xpath(".//a[@href]"):
            event.add_document(anchor.text, anchor.attrib['href'],
                               mimetype='text/html')
        for anchor in agenda_cell.xpath(".//a[@href]"):
            event.add_document(anchor.text, anchor.attrib['href'],
                               mimetype='application/pdf')
        for anchor in minutes_cell.xpath(".//a[@href]"):
            event.add_document(anchor.text, anchor.attrib['href'],
                               mimetype='application/pdf')

        yield event
def get_events(self):
    """Scrape the NYC Council Legistar calendar and yield Event objects."""
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    url = "http://legistar.council.nyc.gov/Calendar.aspx"
    page = self.lxmlize(url)
    main = page.xpath("//table[@class='rgMasterTable']")[0]

    for row in main.xpath(".//tr")[1:]:
        cells = row.xpath(".//td")
        if len(cells) <= 2:
            continue  # Odd one-off row.

        # Third cell is just the calendar icon next to the meeting date.
        name, date, _, time, where, topic, details, agenda, minutes, media = cells

        title = name.text_content().strip()  # leaving an href on the table
        time_text = time.text_content().strip()
        location = where.text_content().strip()
        topic_text = topic.text_content().strip()

        if "Deferred" in time_text:
            continue

        date_text = date.text.strip()
        all_day = time_text == ""
        if all_day:
            when = dt.datetime.strptime(date_text, "%m/%d/%Y")
        else:
            when = dt.datetime.strptime("%s %s" % (date_text, time_text),
                                        "%m/%d/%Y %I:%M %p")

        event = Event(name=title, session=self.session, when=when,
                      location=location)
        event.add_source(url)

        # Each document column carries its own mimetype.
        doc_columns = ((details, "text/html"),
                       (agenda, "application/pdf"),
                       (minutes, "application/pdf"))
        for column, mime in doc_columns:
            for link in column.xpath(".//a[@href]"):
                event.add_document(link.text, link.attrib["href"],
                                   mimetype=mime)

        yield event
def get_events(self):
    """Yield an Event for each New Jersey executive order issued during
    the administration matching ``self.session``.

    Scrapes the state's executive-order index table: single-cell rows
    name a governor (starting a new administration section) and
    three-cell rows describe individual orders (number / title / date).
    """
    # get list of executive orders
    url = 'http://nj.gov/infobank/circular/eoindex.htm'
    page = self.urlopen(url)
    page = lxml_html.fromstring(page)
    page.make_links_absolute(url)

    # parser state: filled in when a governor header row is encountered
    governor_name = None
    gov_session_name = None
    # hoisted out of the loop; one MimeTypes instance is enough
    mime_types = MimeTypes()

    # parse the table of executive orders
    for eo_row in page.xpath('//table[@border>0]//tr'):
        cols = eo_row.xpath('.//td')

        if len(cols) == 1:
            # Governor header row, e.g. "Executive Orders ...
            # Governor Jane Doe's Administration".
            # remove possessives like "'s", then collapse whitespace
            governor_name = re.sub(r'\W\w\s', ' ', eo_row.xpath('string()'))
            governor_name = re.sub(r'\r*\n|\W', ' ', governor_name)
            governor_name = re.sub(r'\s+', ' ', governor_name)
            governor_name = re.search(
                'executive order.*governor(.*)administration',
                governor_name, re.IGNORECASE).groups()[0].strip()
            gov_session_name = re.sub(r'\s+', '_', governor_name)

        elif len(cols) == 3:
            # One executive order: number / title / date.
            if self.session != gov_session_name:
                continue  # order belongs to a different administration

            eo_num = cols[0].xpath('string()').strip()
            try:
                float(eo_num)
            except ValueError:
                continue  # not a numbered order row (e.g. a spacer)

            eo_title = re.sub(r'\r*\n', ' ', cols[1].xpath('string()'))
            eo_title = re.sub(r'\s+', ' ', eo_title)
            eo_title = re.sub(r'\[.*pdf.*\]', '', eo_title).strip()
            # was "== '' or is None": .strip() never yields None
            if not eo_title:
                continue

            eo_date = re.search(
                r'([0-9]{1,2}).*/([0-9]{1,2}).*/([0-9]{4}|[0-9]{2})',
                cols[2].xpath('string()'))
            if eo_date is None:
                continue
            eo_date = '/'.join(eo_date.groups())
            # two-digit years first; a four-digit year fails %y and
            # falls through to %Y
            try:
                eo_date = dt.datetime.strptime(eo_date, '%m/%d/%y')
            except ValueError:
                eo_date = dt.datetime.strptime(eo_date, '%m/%d/%Y')

            eo_source = cols[0].xpath('.//a')[0].get('href').lower()
            mime_type = mime_types.guess_type(eo_source)[0]
            if mime_type is None:
                mime_type = 'text/html'

            # build yield object
            eo = Event(eo_num, eo_date, 'New Jersey', gov_session_name)
            eo.add_person(governor_name, 'governor')
            eo.description = eo_title
            eo.add_document(eo_num, eo_source, mime_type)
            eo.add_source(eo_source)
            yield eo
def migrate_events(self, state):
    """Migrate billy-format event documents from MongoDB into Event
    objects and persist each via ``self.save_object``.

    :param state: optional state abbreviation; when truthy, restricts
        the query to that state's events.
    """
    spec = {}
    if state:
        spec['state'] = state
    # timeout=False keeps the cursor alive for long-running migrations
    for entry in self.billy_db.events.find(spec, timeout=False):
        e = Event(
            name=entry['description'],
            when=entry['when'],
            location=entry['location'],
            session=entry['session'],
            updated_at=entry['updated_at'],
            created_at=entry['created_at'],
            type=entry['type'],
        )
        # preserve the original billy id so records can be cross-referenced
        e.identifiers = [{'scheme': 'openstates',
                          'identifier': entry['_id']}]
        e._openstates_id = entry['_id']
        if entry.get('+location_url'):
            e.add_location_url(entry['+location_url'])
        # the link may live under either "link" or the legacy "+link" key
        link = entry.get('link', entry.get("+link"))
        if link:
            e.add_link(link, 'link')
        # keys handled explicitly above/below; everything else goes to extras
        blacklist = ["description", "when", "location", "session",
                     "updated_at", "created_at", "end", "sources",
                     "documents", "related_bills", "state", "+link",
                     "link", "level", "participants", "country",
                     "_all_ids", "type"]
        e.status = entry.get('status')
        # normalize known status misspellings from the source data
        typos = {
            "canceled": "cancelled"
        }
        if e.status in typos:
            e.status = typos[e.status]
        for key, value in entry.items():
            # skip blacklisted, empty, and private ("_"-prefixed) keys
            if key in blacklist or not value or key.startswith("_"):
                continue
            e.extras[key] = value
        if entry.get('end'):
            end = entry['end']
            # "end" may be a unix timestamp or already a datetime;
            # fromtimestamp raises TypeError for the latter, so pass through
            try:
                end = dt.datetime.fromtimestamp(end)
            except TypeError:
                pass
            e.end = end
        for source in entry['sources']:
            e.add_source(url=source['url'])
        if e.sources == []:
            # an event with no sources is unusable; drop it
            continue  # XXX: print warning
        for document in entry.get('documents', []):
            # Try to add the mimetype. If it fails, fall back to a generic
            # undeclared application/octet-stream.
            e.add_document(name=document.get('name'),
                           document_id=document.get('doc_id'),
                           url=document['url'],
                           mimetype=document.get(
                               "mimetype", document.get(
                                   "+mimetype",
                                   "application/octet-stream")))
        # lazily create a single agenda item to hold all related bills
        agenda = None
        for bill in entry.get('related_bills', []):
            if agenda is None:
                agenda = e.add_agenda_item(
                    description="Bills up for Consideration"
                )
            # _hot_cache maps billy ids to new-system ids — presumably
            # populated earlier in the migration; verify against caller
            hcid = _hot_cache.get(bill.get('id', None), None)
            bid = bill['bill_id']
            if bid is None:
                continue
            agenda.add_bill(bill=bid, id=hcid)
        for who in entry.get('participants', []):
            participant_type = who.get('participant_type', 'committee')
            # I've gone through the backlog of OpenStates data, they are
            # all committees of some sort.
            who_chamber = who.get('chamber')
            if who_chamber is None:
                # fall back to the legacy chamber keys
                for chamber in ["_chamber", "+chamber"]:
                    f = who.get(chamber)
                    if f:
                        who_chamber = f
                        break
            if who_chamber is None:
                # Freak of nature ...
                continue
            hcid = _hot_cache.get(who.get('id', None), None)
            e.add_participant(
                name=who['participant'],
                # map billy participant types onto the new schema
                type={
                    "committee": "organization",
                    "legislator": "person",
                    "person": "person",
                }[participant_type],
                id=hcid,
                note=who['type'],
                chamber=who_chamber)
        self.save_object(e)
def get_events(self):
    """Yield an Event for each Arlington meeting listed in the archive
    and upcoming tables, attaching agenda, video, audio, and minutes
    links where present.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # attempt to map the cells across table types.
            # if the sizes mismatch, ignore this one (it's an "empty" message)
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            # FIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit; best-effort skip is preserved
            except Exception:
                continue

            meeting_title = cell_mapping['title'].text
            # the date cell carries a unix timestamp inside a <span>
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title, when=meeting_date,
                      session=self.session, location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # detect agenda url, if present
            meeting_agenda_url = None
            if len(cell_mapping['agenda'].cssselect('a')) > 0:
                meeting_agenda_url = \
                    cell_mapping['agenda'].cssselect('a')[0].attrib.get('href')

            # follow the agenda URL and attempt to extract associated documents
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda', url=meeting_agenda_url,
                               mimetype='text/html')
                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not len(link_url):
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text, url=link_url,
                                           mimetype='application/pdf')

            # skip everything below here for the 'upcoming' table
            if meeting_type == 'upcoming':
                continue

            # detect video
            # TODO: extract actual mp4 files
            video_cell = cell_mapping['video'].cssselect('a')
            if len(video_cell) > 0:
                # BUG FIX: the old pattern's group(0) included the trailing
                # apostrophe in the stored URL; capture the URL without it.
                video_url_match = re.search(
                    r"(http://[^']*)'", video_cell[0].attrib.get('onclick', ''))
                if video_url_match is not None:
                    e.add_media_link(name="Video",
                                     url=video_url_match.group(1),
                                     mimetype='text/html')

            # detect audio
            audio_cell = cell_mapping['audio'].cssselect('a')
            if len(audio_cell) > 0:
                e.add_media_link(name="Audio",
                                 url=audio_cell[0].attrib.get('href', ''),
                                 mimetype='audio/mpeg')

            # detect minutes
            minutes_cell = cell_mapping['minutes'].cssselect('a')
            if len(minutes_cell) > 0:
                e.add_media_link(name="Minutes",
                                 url=minutes_cell[0].attrib.get('href', ''),
                                 mimetype='text/html')

            yield e