def get_events(self):
    """Yield Event objects scraped from Boston's sirepub meeting-results page.

    Raises:
        Exception: if ``self.session`` is not the current session.
    """
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")
    url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"
    page = self.lxmlize(url)
    for entry in page.xpath(
            "//tr[@style='font-family: Verdana; font-size: 12px;']"):
        name, when, links = entry.xpath(".//td")
        # NOTE(review): u"\xc2\xa0" is the latin-1 misdecoding of a UTF-8
        # non-breaking space; presumably the page bytes are mis-decoded
        # upstream — confirm against self.lxmlize's decoding.
        name = name.text.strip().replace(u"\xc2\xa0", "")
        when = when.text.strip().replace(u"\xc2\xa0", "")
        when = dt.datetime.strptime(when, "%m/%d/%Y")
        links = links.xpath(".//a")
        links = {x.text: x.attrib['href'] for x in links}
        e = Event(name=name,
                  session=self.session,
                  when=when,
                  location='unknown')
        e.add_source(url)
        # Fix: use a distinct loop variable so the page-level `url` local
        # is no longer shadowed/clobbered by the link hrefs.
        for note, link_url in links.items():
            e.add_link(note=note, url=link_url)
        yield e
def test_basic_event():
    """An Event with a source and two links should validate cleanly."""
    event = Event(name="get-together",
                  when=dt.datetime.utcnow(),
                  location="Joe's Place")
    event.add_source(url='foobar')
    event.validate()
    event.add_link("http://foobar.baz")
    event.add_link("http://foobar.baz", note="foo")
    event.validate()
    assert len(event.links) == 2
def get_events(self):
    """Yield an Event for every meeting on the Chicago Legistar calendar.

    Raises:
        Exception: if ``self.session`` is not the current session.
    """
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")
    url = "http://chicago.legistar.com/Calendar.aspx/"
    page = self.lxmlize(url)
    calendar_table = page.xpath("//table[@class='rgMasterTable']")[0]
    for row in calendar_table.xpath(".//tr")[1:]:
        if "No records were found." in row.text_content():
            self.warning("Hum. They don't seem to have events?")
            continue
        # Column layout of the calendar table; the unnamed cell holds the
        # image next to the date on the page.
        (name, date, _, time, where, details, notice,
         agenda, summary, video) = row.xpath(".//td")
        name = name.text_content().strip()  # leaving an href on the table
        time = time.text_content().strip()
        location = where.text_content().strip()
        if "Deferred" in time:
            continue
        # An empty time cell means an all-day event with a date-only format.
        all_day = time == ""
        if all_day:
            when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
        else:
            when = dt.datetime.strptime(
                "%s %s" % (date.text.strip(), time), "%m/%d/%Y %I:%M %p")
        event = Event(name=name, session=self.session,
                      when=when, location=location)
        event.add_source(url)
        for anchor in agenda.xpath(".//a[@href]"):
            event.add_link(anchor.text, anchor.attrib['href'])
        for anchor in summary.xpath(".//a[@href]"):
            event.add_link(anchor.text, anchor.attrib['href'])
        yield event
def migrate_events(self, state):
    """Migrate billy event records into pupa-style Event objects and save.

    Reads every document in ``self.billy_db.events`` (optionally filtered
    by *state*), builds an Event from the known fields, copies any
    unrecognized non-empty fields into ``extras``, attaches sources,
    documents, related bills, and participants, then persists each Event
    via ``self.save_object``.  Events with no sources are skipped.
    """
    spec = {}
    if state:
        spec['state'] = state
    # timeout=False keeps the mongo cursor alive for a long-running scan.
    for entry in self.billy_db.events.find(spec, timeout=False):
        e = Event(
            name=entry['description'],
            when=entry['when'],
            location=entry['location'],
            session=entry['session'],
            updated_at=entry['updated_at'],
            created_at=entry['created_at'],
            type=entry['type'],
        )
        # Preserve the original billy id for cross-referencing.
        e.identifiers = [{'scheme': 'openstates',
                          'identifier': entry['_id']}]
        e._openstates_id = entry['_id']
        if entry.get('+location_url'):
            e.add_location_url(entry['+location_url'])
        link = entry.get('link', entry.get("+link"))
        if link:
            e.add_link(link, 'link')
        # Keys handled explicitly elsewhere in this method; everything not
        # listed here (and not falsy / underscore-prefixed) goes to extras.
        blacklist = ["description", "when", "location", "session",
                     "updated_at", "created_at", "end", "sources",
                     "documents", "related_bills", "state", "+link", "link",
                     "level", "participants", "country", "_all_ids", "type"]
        e.status = entry.get('status')
        # Normalize a known spelling variant of the status value.
        typos = {
            "canceled": "cancelled"
        }
        if e.status in typos:
            e.status = typos[e.status]
        for key, value in entry.items():
            if key in blacklist or not value or key.startswith("_"):
                continue
            e.extras[key] = value
        if entry.get('end'):
            end = entry['end']
            # 'end' may be a unix timestamp or already a datetime; leave
            # it untouched if fromtimestamp rejects the type.
            try:
                end = dt.datetime.fromtimestamp(end)
            except TypeError:
                pass
            e.end = end
        for source in entry['sources']:
            e.add_source(url=source['url'])
        if e.sources == []:
            continue  # XXX: print warning
        for document in entry.get('documents', []):
            # Try to add the mimetype. If it fails, fall back to a generic
            # undeclared application/octet-stream.
            e.add_document(name=document.get('name'),
                           document_id=document.get('doc_id'),
                           url=document['url'],
                           mimetype=document.get(
                               "mimetype", document.get(
                                   "+mimetype",
                                   "application/octet-stream")))
        # All related bills are grouped under one shared agenda item,
        # created lazily on first use.
        agenda = None
        for bill in entry.get('related_bills', []):
            if agenda is None:
                agenda = e.add_agenda_item(
                    description="Bills up for Consideration"
                )
            hcid = _hot_cache.get(bill.get('id', None), None)
            bid = bill['bill_id']
            if bid is None:
                continue
            agenda.add_bill(bill=bid, id=hcid)
        for who in entry.get('participants', []):
            participant_type = who.get('participant_type', 'committee')
            # I've gone through the backlog of OpenStates data, they are
            # all committees of some sort.
            who_chamber = who.get('chamber')
            if who_chamber is None:
                # Fall back to legacy chamber keys.
                for chamber in ["_chamber", "+chamber"]:
                    f = who.get(chamber)
                    if f:
                        who_chamber = f
                        break
            if who_chamber is None:
                # Freak of nature ...
                continue
            hcid = _hot_cache.get(who.get('id', None), None)
            e.add_participant(
                name=who['participant'],
                type={
                    "committee": "organization",
                    "legislator": "person",
                    "person": "person",
                }[participant_type],
                id=hcid,
                note=who['type'],
                chamber=who_chamber)
        self.save_object(e)
def get_events(self):
    """Yield Events scraped from Arlington's meeting listing page.

    Walks both the 'archive' and 'upcoming' tables, building an Event per
    row and attaching agenda links/documents plus (archive only) video,
    audio, and minutes media links where present.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)
    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # attempt to map the cells across table types.
            # if the sizes mismatch, ignore this one (it's an "empty" message)
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are no longer swallowed; still best-effort.
                continue
            meeting_title = cell_mapping['title'].text
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))
            e = Event(name=meeting_title, when=meeting_date,
                      session=self.session, location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)
            # detect agenda url, if present
            meeting_agenda_url = None
            if len(cell_mapping['agenda'].cssselect('a')) > 0:
                meeting_agenda_url = (
                    cell_mapping['agenda'].cssselect('a')[0]
                    .attrib.get('href'))
            # follow the agenda URL and attempt to extract associated documents
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda',
                               url=meeting_agenda_url,
                               mimetype='text/html')
                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(
                    meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not len(link_url):
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always
                        # be correct
                        if link.text is not None:
                            e.add_document(name=link.text,
                                           url=link_url,
                                           mimetype='application/pdf')
            # skip everything below here for the 'upcoming' table
            if meeting_type == 'upcoming':
                continue
            # detect video
            # TODO: extract actual mp4 files
            video_cell = cell_mapping['video'].cssselect('a')
            if len(video_cell) > 0:
                # Fix: capture the URL itself; the previous pattern's
                # group(0) included the closing apostrophe from the
                # onclick attribute in the stored URL.
                video_url_match = re.search(
                    r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                if video_url_match is not None:
                    e.add_media_link(name="Video",
                                     url=video_url_match.group(1),
                                     mimetype='text/html')
            # detect audio
            audio_cell = cell_mapping['audio'].cssselect('a')
            if len(audio_cell) > 0:
                e.add_media_link(name="Audio",
                                 url=audio_cell[0].attrib.get('href', ''),
                                 mimetype='audio/mpeg')
            # detect minutes
            minutes_cell = cell_mapping['minutes'].cssselect('a')
            if len(minutes_cell) > 0:
                e.add_media_link(name="Minutes",
                                 url=minutes_cell[0].attrib.get('href', ''),
                                 mimetype='text/html')
            yield e