Example #1
0
    def get_events(self):
        """Scrape New York executive orders and yield them as Event objects.

        Walks the executive-order index page, follows each non-PDF order
        link, extracts the signing date from the order's full text, and
        yields one Event per order.
        """
        # get list of executive orders
        url = 'http://www.governor.ny.gov/sl2/ExecutiveOrderindex'
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # extract governor's name
        gov = page.xpath("(//div[@class='section-header']/div/div/div/a/div/h2)[1]")[0]
        # BUG FIX: str.lstrip('Governor ') strips any of those *characters*,
        # not the literal prefix, so e.g. "Governor Evans" lost leading
        # letters of the surname.  Strip the exact prefix instead.
        governor_name = gov.text
        if governor_name.startswith('Governor '):
            governor_name = governor_name[len('Governor '):]

        # scrape each executive order
        for eo_par in page.xpath("//div[@class='content']/p"):
            for link in eo_par.xpath(".//a"):

                url = link.get('href').lower()
                if url.endswith('.pdf'):
                    # PDF links have no parseable HTML text; skip them
                    continue

                # get date for executive order: flatten the page text to
                # single-spaced lowercase words so the regexes below match
                # (raw strings — the originals used invalid escapes like '\W')
                eo_page = self.urlopen(url)
                eo_page = lxml.html.fromstring(eo_page)
                eo_page = re.sub(r'(\r*\n|\W)', ' ', eo_page.xpath('string()').lower())
                eo_page = re.sub(r'\s+', ' ', eo_page)
                date_par = re.search(r'(?:g i v e n)(.*)(?:by the governor)', eo_page).groups()[0]
                date_comp = [s.strip() for s in
                             re.match(r'(?:.*this)(.*)(?:day of)(.*)(?:in the year)(.*)', date_par).groups()]
                # Wtn.parse turns written-out numbers ("twenty eighth") into ints
                eo_date = dt.datetime.strptime(' '.join(
                    (str(Wtn.parse(date_comp[0])), date_comp[1], str(Wtn.parse(date_comp[2])))), '%d %B %Y')

                # build yield object
                eo_number = eo_par.xpath('string()').split(':', 1)[0]
                eo = Event(eo_number, eo_date, 'New York')
                eo.add_person(governor_name, 'governor')
                eo.description = link.text
                eo.add_document(eo_number, url, 'text/html')
                eo.add_source(url)

                yield eo

        # TODO: get list of press statements
Example #2
0
    def get_events(self):
        """Yield an Event for each scheduled meeting on the NYC Legistar calendar."""
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://legistar.council.nyc.gov/Calendar.aspx"
        doc = self.lxmlize(url)
        table = doc.xpath("//table[@class='rgMasterTable']")[0]
        # first <tr> is the column-header row; skip it
        for row in table.xpath(".//tr")[1:]:
            cells = row.xpath(".//td")
            if len(cells) <= 2:
                continue  # Odd one-off.

            (name_el, date_el, _, time_el, where_el, topic_el,
             details_el, agenda_el, minutes_el, media_el) = cells
            # the skipped cell is just the calendar icon beside the date

            name = name_el.text_content().strip()  # leaving an href on the table
            meeting_time = time_el.text_content().strip()
            location = where_el.text_content().strip()
            topic = topic_el.text_content().strip()

            if "Deferred" in meeting_time:
                continue

            all_day = False
            if meeting_time == "":
                all_day = True
                when = dt.datetime.strptime(date_el.text.strip(), "%m/%d/%Y")
            else:
                stamp = "%s %s" % (date_el.text.strip(), meeting_time)
                when = dt.datetime.strptime(stamp, "%m/%d/%Y %I:%M %p")

            event = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)
            event.add_source(url)

            # details link to HTML pages; agendas and minutes are PDFs
            for anchor in details_el.xpath(".//a[@href]"):
                event.add_document(anchor.text, anchor.attrib['href'],
                                   mimetype='text/html')

            for anchor in agenda_el.xpath(".//a[@href]"):
                event.add_document(anchor.text, anchor.attrib['href'],
                                   mimetype='application/pdf')

            for anchor in minutes_el.xpath(".//a[@href]"):
                event.add_document(anchor.text, anchor.attrib['href'],
                                   mimetype='application/pdf')

            yield event
Example #3
0
    def get_events(self):
        """Scrape the NYC Legistar calendar page and yield one Event per meeting.

        Raises:
            Exception: if called for any session other than the current one —
                the calendar page only lists current-session meetings.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://legistar.council.nyc.gov/Calendar.aspx"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]  # first row is the header
        for row in rows:
            els = row.xpath(".//td")
            if len(els) <= 2:
                continue  # Odd one-off.

            (name, date, _, time, where, topic, details, agenda, minutes, media) = els
            # _ nom's the image of the cal next to the meeting date.

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()
            topic = topic.text_content().strip()

            if "Deferred" in time:
                continue

            # An empty time cell means an all-day meeting: parse the date only.
            # (The old `all_day` flag was computed but never used — removed.)
            if time == "":
                when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
            else:
                when = dt.datetime.strptime("%s %s" % (date.text.strip(), time), "%m/%d/%Y %I:%M %p")

            event = Event(name=name, session=self.session, when=when, location=location)
            event.add_source(url)

            # details links are HTML pages; agendas and minutes are PDFs
            details = details.xpath(".//a[@href]")
            for detail in details:
                event.add_document(detail.text, detail.attrib["href"], mimetype="text/html")

            agendas = agenda.xpath(".//a[@href]")
            for a in agendas:
                event.add_document(a.text, a.attrib["href"], mimetype="application/pdf")

            minutes = minutes.xpath(".//a[@href]")
            for minute in minutes:
                event.add_document(minute.text, minute.attrib["href"], mimetype="application/pdf")

            yield event
Example #4
0
    def get_events(self):
        """Scrape New Jersey executive orders and yield them as Event objects.

        Parses the order index table: single-column rows announce a governor
        (which sets the session name); three-column rows are individual
        orders, yielded only when they belong to ``self.session``.
        """
        # get list of executive orders
        url = 'http://nj.gov/infobank/circular/eoindex.htm'
        page = self.urlopen(url)
        page = lxml_html.fromstring(page)
        page.make_links_absolute(url)

        # state variables for parser — carried across rows because a
        # governor header row applies to all order rows that follow it
        governor_name = None
        gov_session_name = None

        # parse the table of executive orders
        # (all regex patterns are raw strings; originals used invalid
        # escapes such as '\W' and '\s' in plain strings)
        for eo_row in page.xpath('//table[@border>0]//tr'):

            cols = eo_row.xpath('.//td')

            # extract governor's name
            if len(cols) == 1:
                # remove things like "'s"
                governor_name = re.sub(r'\W\w\s', ' ', eo_row.xpath('string()'))
                governor_name = re.sub(r'\r*\n|\W', ' ', governor_name)
                governor_name = re.sub(r'\s+', ' ', governor_name)
                governor_name = re.search("executive order.*governor(.*)administration",
                                          governor_name, re.IGNORECASE).groups()[0].strip()
                gov_session_name = re.sub(r'\s+', '_', governor_name)

            # extract executive order
            elif len(cols) == 3:
                if self.session == gov_session_name:
                    eo_num = cols[0].xpath('string()').strip()
                    try:
                        float(eo_num)
                    except ValueError:
                        continue  # not an order number (e.g. a header cell)

                    eo_title = re.sub(r'\r*\n', ' ', cols[1].xpath('string()'))
                    eo_title = re.sub(r'\s+', ' ', eo_title)
                    eo_title = re.sub(r'\[.*pdf.*\]', '', eo_title).strip()
                    # .strip() always returns a str, so only the empty
                    # check is needed (the old `is None` test was dead)
                    if eo_title == '':
                        continue

                    eo_date = re.search(r'([0-9]{1,2}).*/([0-9]{1,2}).*/([0-9]{4}|[0-9]{2})', cols[2].xpath('string()'))
                    if eo_date is None:
                        continue
                    eo_date = '/'.join(eo_date.groups())
                    # the site mixes 2- and 4-digit years; try both formats
                    try:
                        eo_date = dt.datetime.strptime(eo_date, '%m/%d/%y')
                    except ValueError:
                        eo_date = dt.datetime.strptime(eo_date, '%m/%d/%Y')

                    eo_source = cols[0].xpath('.//a')[0].get('href').lower()
                    mime_type = MimeTypes().guess_type(eo_source)[0]
                    if mime_type is None:
                        mime_type = 'text/html'

                    # build yield object
                    eo = Event(eo_num, eo_date, 'New Jersey', gov_session_name)
                    eo.add_person(governor_name, 'governor')
                    eo.description = eo_title
                    eo.add_document(eo_num, eo_source, mime_type)
                    eo.add_source(eo_source)

                    yield eo
Example #5
0
    def migrate_events(self, state):
        """Migrate legacy billy event records into the new Event model.

        Reads events from ``self.billy_db.events`` (optionally filtered to
        one state), converts each to an Event, copies documents, related
        bills and participants, and persists it via ``self.save_object``.

        :param state: state abbreviation to filter on; falsy means all states.
        """
        spec = {}
        if state:
            spec['state'] = state

        # timeout=False: the migration may outlive the default cursor timeout
        for entry in self.billy_db.events.find(spec, timeout=False):

            e = Event(
                name=entry['description'],
                when=entry['when'],
                location=entry['location'],
                session=entry['session'],
                updated_at=entry['updated_at'],
                created_at=entry['created_at'],
                type=entry['type'],
            )
            # keep the legacy id so the record can be cross-referenced later
            e.identifiers = [{'scheme': 'openstates',
                             'identifier': entry['_id']}]
            e._openstates_id = entry['_id']
            if entry.get('+location_url'):
                e.add_location_url(entry['+location_url'])

            # the link may live under either the plain or the '+'-prefixed key
            link = entry.get('link', entry.get("+link"))
            if link:
                e.add_link(link, 'link')

            # keys handled explicitly above/below; everything else becomes
            # an entry in e.extras
            blacklist = ["description", "when", "location", "session",
                         "updated_at", "created_at", "end", "sources",
                         "documents", "related_bills", "state", "+link",
                         "link", "level", "participants", "country",
                         "_all_ids", "type"]

            e.status = entry.get('status')
            # normalize a known misspelling in the legacy data
            typos = {
                "canceled": "cancelled"
            }
            if e.status in typos:
                e.status = typos[e.status]

            # copy remaining non-empty, non-internal fields verbatim
            for key, value in entry.items():
                if key in blacklist or not value or key.startswith("_"):
                    continue
                e.extras[key] = value

            if entry.get('end'):
                end = entry['end']
                # 'end' may be a unix timestamp or already a datetime;
                # TypeError from fromtimestamp means it is the latter
                try:
                    end = dt.datetime.fromtimestamp(end)
                except TypeError:
                    pass

                e.end = end

            for source in entry['sources']:
                e.add_source(url=source['url'])

            # an event without sources is unusable downstream; skip it
            if e.sources == []:
                continue  # XXX: print warning

            for document in entry.get('documents', []):
                e.add_document(name=document.get('name'),
                               document_id=document.get('doc_id'),
                               url=document['url'],
                               mimetype=document.get(
                                   "mimetype", document.get(
                                       "+mimetype",
                                       "application/octet-stream")))
                # Try to add the mimetype. If it fails, fall back to a generic
                # undeclared application/octet-stream.

            # lazily create a single agenda item shared by all related bills
            agenda = None
            for bill in entry.get('related_bills', []):
                if agenda is None:
                    agenda = e.add_agenda_item(
                        description="Bills up for Consideration"
                    )

                # _hot_cache maps legacy ids to migrated ids; may be None
                hcid = _hot_cache.get(bill.get('id', None), None)
                bid = bill['bill_id']
                if bid is None:
                    continue

                agenda.add_bill(bill=bid, id=hcid)

            for who in entry.get('participants', []):
                participant_type = who.get('participant_type', 'committee')
                # I've gone through the backlog of OpenStates data, they are
                # all committees of some sort.

                # chamber may also be stored under a prefixed legacy key
                who_chamber = who.get('chamber')
                if who_chamber is None:
                    for chamber in ["_chamber", "+chamber"]:
                        f = who.get(chamber)
                        if f:
                            who_chamber = f
                            break

                if who_chamber is None:
                    # Freak of nature ...
                    continue

                hcid = _hot_cache.get(who.get('id', None), None)

                # map legacy participant types onto the new person/organization
                # distinction
                e.add_participant(
                    name=who['participant'],
                    type={
                        "committee": "organization",
                        "legislator": "person",
                        "person": "person",
                    }[participant_type],
                    id=hcid,
                    note=who['type'],
                    chamber=who_chamber)

            self.save_object(e)
Example #6
0
    def get_events(self):
        """Scrape Arlington's meeting page and yield an Event per meeting.

        Covers both the 'archive' and 'upcoming' tables; archived meetings
        additionally get video/audio/minutes media links.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                # (narrowed from a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    continue

                meeting_title = cell_mapping['title'].text
                # the date cell carries a unix timestamp in a <span>
                meeting_date = datetime.datetime.fromtimestamp(int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, session=self.session, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_links = cell_mapping['agenda'].cssselect('a')
                if agenda_links:
                    meeting_agenda_url = agenda_links[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

                # skip everything below here for the 'upcoming' table
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    # BUG FIX: the URL is embedded in an onclick handler as
                    # http://...'; capture it WITHOUT the closing quote
                    # (group(0) of the old pattern included the apostrophe)
                    video_url_match = re.search(r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url=video_url_match.group(1), mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e