Example No. 1
    def scrape(self, chamber, session):
        url = "http://www.legislature.state.oh.us/today.cfm"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for td in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"):
                date = td.text.strip()

                if chamber == 'upper':
                    desc = td.getnext().text.strip()
                else:
                    desc = td.getnext().getnext().text.strip()

                match = re.match(r'^Session at (\d+:\d+ [pa]\.m\.)', desc)
                if match:
                    time = match.group(1)
                    time = time.replace('a.m.', 'AM').replace('p.m.', 'PM')

                    when = "%s 2011 %s" % (date, time)
                    when = datetime.datetime.strptime(when,
                                                      "%a. %b %d %Y %I:%M %p")
                    when = self._tz.localize(when)

                    chamber_name = {
                        'upper': 'Senate',
                        'lower': 'House'
                    }[chamber]

                    event = Event(session, when, 'floor_time', desc,
                                  "%s Chamber" % chamber_name)
                    event.add_source(url)
                    self.save_event(event)
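All of these examples drive billy's `Event` model. Going by the prototype quoted in the comment in Example No. 11 (`Event(session, when, type, description, location, end=None, **kwargs)`), the shared skeleton looks roughly like the sketch below; the description, location, and URL are illustrative placeholders, not values from any real scraper.

    import datetime

    def scrape_minimal(self, session):
        # Minimal sketch of the pattern the examples share: build a
        # timezone-aware datetime, construct an Event, attach at least
        # one source URL, then persist it.
        when = self._tz.localize(datetime.datetime(2012, 5, 23, 10, 0))
        event = Event(session, when, 'committee:meeting',
                      'Example Committee Meeting',
                      location='State Capitol')
        event.add_source('http://example.com/agenda')  # hypothetical URL
        self.save_event(event)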
Example No. 2
    def scrape_upper(self, session):
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'Meeting_Notice')]"):
            comm = link.text.strip()
            comm = re.sub(r'\s+', ' ', comm)

            if link.getnext().text == 'Cancelled':
                continue

            date_path = "../../preceding-sibling::p[@class='MsoNormal']"
            date = link.xpath(date_path)[-1].xpath("string()")

            time_loc = link.xpath("../br")[0].tail.strip()
            time = re.match(r"\d+:\d+ (am|pm)", time_loc).group(0)
            location = time_loc.split(', ')[1].strip()

            dt = "%s %s" % (date, time)
            dt = datetime.datetime.strptime(dt, "%A, %B %d, %Y %I:%M %p")

            event = Event(session, dt, 'committee:meeting',
                          "%s Committee Meeting" % comm,
                          location)
            event.add_source(url)
            self.save_event(event)
Example No. 3
    def scrape(self, chamber, session):
        if chamber != "other":
            return
        url = "http://www.leg.state.vt.us/HighlightsMain.cfm"
        page = self.lxmlize(url)
        ps = page.xpath(
            "//p[@class='HighlightsNote' or @class='HighlightsDate']")
        events = {}
        event_set = []
        for p in ps:
            if p.attrib['class'] == "HighlightsNote":
                event_set.append(p)
            else:
                date_time = p.text[len("Posted "):]
                events[date_time] = event_set
                event_set = []
        for date in events:
            date_time = dt.datetime.strptime(date, "%m/%d/%Y")
            for event in events[date]:
                descr = event.text_content()
                e = Event(session,
                          date_time,
                          "other",
                          descr,
                          location="state house")
                e.add_source(url)
                self.save_event(e)
Example No. 4
    def scrape_upper_events(self, session):
        url = ("http://www.nysenate.gov/calendar/ical/"
               "senator%3DAll%2526type%3D3%2526committee%3DAll"
               "%2526initiative%3DAll")

        with self.urlopen(url) as page:
            cal = icalendar.Calendar.from_string(page)

            for comp in cal.walk():
                if comp.name != 'VEVENT':
                    continue

                text = str(comp['SUMMARY'])
                if 'Committee Meeting' not in text:
                    continue

                start = _tz.localize(comp['DTSTART'].dt)
                end = _tz.localize(comp['DTEND'].dt)
                uid = str(comp['UID'])
                event_url = comp['URL']

                location = self.get_upper_location(event_url)
                self.logger.debug(location)

                event = Event(session, start, 'committee:meeting', text,
                              location, end)
                event.add_source(url)
                event.add_source(event_url)

                self.save_event(event)
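Note that `icalendar.Calendar.from_string`, used above, is the pre-3.0 spelling of the parser entry point; on a modern icalendar release the equivalent call should be `from_ical`:

    # icalendar >= 3.0 renamed the parser method; older code used from_string.
    cal = icalendar.Calendar.from_ical(page)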
Example No. 5
    def scrape(self, chamber, session):
        bills_discussed = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            bills_discussed[(location, date)].append(hearing.bill_id)

        for ((location, date), bills) in bills_discussed.iteritems():
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bills
            ]

            desc = 'Committee Meeting\n%s\nDiscussed: %s' % (location,
                                                             ', '.join(bills))

            event = Event(session,
                          date,
                          'committee:meeting',
                          desc,
                          location=location)
            event.add_participant('committee', location)
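            # NOTE: unlike the otherwise-similar scraper in Example No. 26,
            # this revision never calls event.add_source() before saving.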

            self.save_event(event)
Example No. 6
    def scrape_upper_events(self, session):
        url = "http://flsenate.gov/Session/DailyCalendarRSS.cfm?format=rss"
        with self.urlopen(url) as page:
            feed = feedparser.parse(page)

            for entry in feed['entries']:
                if 'Committee' not in entry['summary']:
                    continue

                date = datetime.datetime(*entry['updated_parsed'][:6])
                match = re.match(r'(\d+):(\d+)', entry['title'])
                if match:
                    when = datetime.datetime(date.year, date.month, date.day,
                                             int(match.group(1)),
                                             int(match.group(2)), 0)
                    when = self._tz.localize(when)

                    desc = entry['summary'].split(' - ')[0]
                    location = entry['summary'].split(' - ')[1]

                    event = Event(session, when, 'committee:meeting', desc,
                                  location)
                    event.add_source(url)

                    self.save_event(event)
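The `entry['updated_parsed']` value unpacked above is a `time.struct_time`, whose first six fields are year, month, day, hour, minute, and second; that is why slicing `[:6]` can feed `datetime.datetime` directly. A standalone sketch of the same conversion (the feed URL is a placeholder):

    import datetime
    import feedparser

    feed = feedparser.parse("http://example.com/calendar.rss")  # hypothetical
    for entry in feed['entries']:
        # struct_time -> datetime via its first six fields
        when = datetime.datetime(*entry['updated_parsed'][:6])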
Example No. 7
    def scrape_committee_events(self, session, code, name):
        events_url = \
                'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
                'comm_code={}'.format(code)
        events_data = self.get(events_url).text
        events = json.loads(events_data)

        DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
        for info in events:

            if info['title'] is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            elif info['title'].startswith('CANCELLED:'):
                self.info(
                    "Cancelled event found; it will be skipped: {}".format(
                        info['title']))
                continue

            event = Event(
                session=session,
                when=datetime.datetime.strptime(info['start'],
                                                DATETIME_FORMAT),
                end=datetime.datetime.strptime(info['end'], DATETIME_FORMAT),
                type='committee:meeting',
                description=info['title'],
                location="{0} {1}".format(info['building'].strip(),
                                          info['location'].strip()))
            event.add_source(events_url)

            self.save_event(event)
Example No. 8
    def scrape(self, chamber, session):
        if session != '2011 Regular Session':
            raise NoDataForPeriod(session)

        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
                date = div.xpath("string(../../span[1])").strip()

                try:
                    time, location = div.xpath("string(span[1])").split(',')
                except ValueError:
                    # No meetings
                    continue

                when = "%s %s" % (date, time)
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M%p")
                when = self._tz.localize(when)

                desc = div.xpath("string(span[2])").strip()
                event = Event(session,
                              when,
                              'committee:meeting',
                              desc,
                              location=location)
                event.add_source(url)

                self.save_event(event)
Example No. 9
    def scrape(self, chamber, session):
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            if ':' not in time:
                self.warning('skipping event with invalid time: %s', time)
                continue
            when = "%s %s" % (date, time)
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p")
            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            event = Event(session,
                          when,
                          'committee:meeting',
                          desc,
                          location=location)
            event.add_source(url)

            # desc is actually the ctty name.
            event.add_participant('host', desc, 'committee', chamber=chamber)

            self.save_event(event)
Example No. 10
    def scrape_page(self, url, chamber, session):
        page = self.lxmlize(url)
        info_blocks = {
            "canceled": "//div[@class='cancelled']",
            "committee": "//div[@class='titlemeetingtype']",
            "chamber": "//div[@class='titlehouse']",
            "datetime": "//div[@class='datetimelocation']"
        }
        metainf = {}
        for block in info_blocks:
            info = page.xpath(info_blocks[block])
            if info == []:
                continue
            metainf[block] = {"obj": info[0], "txt": info[0].text_content()}

        if 'committee' not in metainf:
            return

        if 'canceled' in metainf:
            return

        obj = metainf['datetime']['obj']
        dates = obj.xpath("./*")
        date_time = obj.text.strip()
        for date in dates:
            if date.tail is not None:
                date_time += " %s" % (date.tail.strip())
        # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol
        splits = ['AM', 'PM']
        date_times = None
        for split in splits:
            if split in date_time:
                date_times = [x.strip() for x in date_time.split(split, 1)]
                date_times[0] += " " + split

        if date_times is None:
            return  # neither AM nor PM present; the date line is unparseable

        time = date_times[0]
        place = date_times[1]

        committee = metainf['committee']['txt']
        chamber = metainf['chamber']['txt']

        try:
            chamber = {
                "Senate": "upper",
                "Assembly": "lower",
                "Joint": "joint"
            }[chamber]
        except KeyError:
            chamber = 'other'

        # Wednesday, May 23, 2012 10:00 AM
        datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
        event = Event(session,
                      datetime,
                      'committee:meeting',
                      committee,
                      location=place)
        event.add_participant('host', committee, chamber=chamber)
        event.add_source(url)
        self.save_event(event)
Example No. 11
    def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            if when and description and location:
                kwargs = {}
                if group in self.metadata['chambers'].keys():
                    kwargs['chamber'] = group
                agenda = self.get_agenda(meeting)
                if agenda:
                    kwargs['agenda'] = agenda

                # Event prototype is as follows:
                # class Event(SourcedObject):
                #    def __init__(self, session, when, type,
                #                 description, location, end=None, **kwargs)
                event = Event(self.session, when, 'committee:meeting',
                              description, location, **kwargs)
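                # NOTE: `url` is not defined anywhere in this snippet; the
                # full scraper presumably sets it elsewhere.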
                event.add_source(url)
                self.save_event(event)
Example No. 12
def actions_to_events(state):
    for bill in db.bills.find({'state': state}):
        print "Converting %s actions to events" % bill['_id']

        count = 1
        for action in bill['actions']:
            guid = "%s:action:%06d" % (bill['_id'], count)
            count += 1

            event = db.events.find_one({'state': state,
                                        '_guid': guid})

            description = "%s: %s" % (bill['bill_id'], action['action'])
            data = Event(bill['session'], action['date'],
                         'bill:action', description, location=action['actor'],
                         action_type=action['type'])
            data.add_participant('actor', action['actor'])
            data['_guid'] = guid
            data['state'] = state

            if not event:
                data['created_at'] = datetime.datetime.utcnow()
                data['updated_at'] = data['created_at']
                _insert_with_id(data)
            else:
                update(event, data, db.events)
Example No. 13
    def scrape_event(self, chamber, session, obj):
        meeting = obj['data']['meeting']
        date = int(meeting['meetingDateTime'])
        date = dt.datetime.fromtimestamp(date / 1000)
        if str(date.year) not in session:
            return
        description = 'Committee Meeting: ' + meeting['committeeName']
        event = Event(session,
                      date,
                      'committee:meeting',
                      description=description,
                      location=meeting['location'] or 'No location given.')
        event.add_source(obj['url'])
        event.add_participant('chair',
                              meeting['committeeChair'],
                              'legislator',
                              chamber='upper')
        event.add_participant('host',
                              meeting['committeeName'],
                              'committee',
                              chamber='upper')

        rgx = r'([a-z]+)(\d+)'
        for bill in meeting['bills']:
            raw_id = bill['senateBillNo']
            bill_id = ' '.join(re.search(rgx, raw_id, re.I).groups())
            event.add_related_bill(bill_id,
                                   type='bill',
                                   description=bill['summary']
                                   or 'No description given.')
        return event
Example No. 14
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                guid = link.attrib['href']

                committee = link.xpath("string(../../../td[1])").strip()

                when_and_where = link.xpath("string(../../../td[2])").strip()

                location = when_and_where.split(',')[-1]
                when = parse_datetime(when_and_where, session)

                description = 'Committee Meeting: %s' % committee

                event = Event(session,
                              when,
                              'committee:meeting',
                              description,
                              location=location)
                event.add_participant('committee', committee)
                event['link'] = guid
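                # NOTE: unlike the later revisions in Examples No. 19 and
                # No. 30, this version never calls event.add_source(url).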

                self.save_event(event)
Example No. 15
    def scrape_upper(self, session):
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        text = page.text_content()
        _, text = text.split('MEETING NOTICES')
        re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
        chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

        for match, data in chunks:
            when = match.group()
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y")

            lines = filter(None, [x.strip() for x in data.splitlines()])

            time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
            time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
            time_ = time.strptime(time_, '%I:%M %p')
            when += datetime.timedelta(hours=time_.tm_hour,
                                       minutes=time_.tm_min)

            title = lines[0]

            where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
            where = where.strip()

            event = Event(session,
                          when,
                          'committee:meeting',
                          title,
                          location=where)
            event.add_source(url)

            self.save_event(event)
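Example No. 15 merges a date-only `datetime` with a separately parsed clock time by adding a `timedelta`. An equivalent, arguably more direct spelling (a sketch reusing the same `when` and `time_` variables from above) is `datetime.datetime.combine`:

    when = datetime.datetime.combine(
        when.date(), datetime.time(time_.tm_hour, time_.tm_min))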
Example No. 16
    def scrape(self, session, chambers):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            if not when.endswith(session[:len("20XX")]):
                continue

            event = Event(session=session,
                          when=datetime.datetime.strptime(when, '%b %d, %Y'),
                          type='committee:meeting',
                          description=title,
                          location='State Capitol')
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            doc = self.lxmlize(url)
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                if committee_name.lower().startswith("Senate"):
                    chamber = "upper"
                elif committee_name.lower().startswith("House"):
                    chamber = "lower"
                else:
                    chamber = "joint"
                event.add_participant(type='host',
                                      participant=committee_name,
                                      participant_type='committee',
                                      chamber=chamber)

            documents = doc.xpath('.//td')
            for document in documents:
                onclick = document.xpath('@onclick')
                if not onclick:
                    continue  # cells without an onclick carry no document
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(name=document.xpath('text()')[0],
                                   url=url,
                                   mimetype='application/pdf')
                for bill in onclick:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        event.add_related_bill(
                            bill_name,
                            type='consideration',
                            description='Bill up for discussion')

            self.save_event(event)
Example No. 17
    def scrape(self, chamber, session):
        if session != '27':
            raise NoDataForPeriod(session)

        if chamber == 'other':
            return

        year, year2 = None, None
        for term in self.metadata['terms']:
            if term['sessions'][0] == session:
                year = str(term['start_year'])
                year2 = str(term['end_year'])
                break

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year[2:]

        url = ("http://www.legis.state.ak.us/basis/"
               "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
               "Comty=&Root=&Sel=1&Button=Display" % (session, date1, date2))

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
            for font in page.xpath(path):
                match = re.match(r'^\((H|S)\)(.+)$', font.text)

                chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
                comm = match.group(2).strip().title()

                next_row = font.xpath("../../following-sibling::tr[1]")[0]

                when = next_row.xpath("string(td[1]/font)").strip()
                when = datetime.datetime.strptime(when + " " + year,
                                                  "%b %d  %A %I:%M %p %Y")
                when = self._tz.localize(when)

                where = next_row.xpath("string(td[2]/font)").strip()

                description = "Committee Meeting\n"
                description += comm

                event = Event(session,
                              when,
                              'committee:meeting',
                              description,
                              location=where)

                links = font.xpath(
                    "../../td/font/a[contains(@href, 'get_documents')]")
                if links:
                    agenda_link = links[0]
                    event['link'] = agenda_link.attrib['href']

                event.add_source(url)
                self.save_event(event)
Example No. 18
    def scrape_committee_agendas(self, chamber, session):
        """
        Scrape upper or lower committee agendas
        """
        # could use &ShowAll=ON doesn't seem to work though
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
                                          self._chamber_short[chamber]
        with self.urlopen(url) as agendas:
            root = html.fromstring(agendas)
            if chamber == 'upper':
                event_table = root.xpath(
                    '//table[@id="body"]/tr/td/table[2]/tr'
                    '/td/table/tr/td/table')[0]
            else:
                event_table = root.xpath(
                    '//table[@id="body"]/tr/td/table[2]/tr'
                    '/td/table/tr/td/table/tr/td/table')[0]
            for row in event_table.xpath('tr')[2:]:
                # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
                # HTML Document, PDF Document for house
                # Agenda Date, Committee, Revised, Cancelled, Time, Room,
                # HTML Document, PDF Document for senate
                text = [x.text_content().strip() for x in row.xpath('td')]
                when, committee = text[0:2]
                if chamber == 'upper':
                    time, room = text[4:6]
                    link = row[6].xpath('string(a/@href)')
                else:
                    time, room = text[5:7]
                    link = row[7].xpath('string(a/@href)')
                if 'NOT MEETING' in time or 'CANCELLED' in time:
                    continue
                time = re.match(r'(\d+:\d+ (A|P))', time)
                if time:
                    when = "%s %sM" % (text[0], time.group(0))
                    when = datetime.datetime.strptime(when,
                                                      '%m/%d/%Y %I:%M %p')
                else:
                    when = text[0]
                    when = datetime.datetime.strptime(when, '%m/%d/%Y')

                when = self._tz.localize(when)

                title = "Committee Meeting:\n%s %s %s\n" % (
                    self._chamber_long[chamber], committee, room)
                (description, member_list, meeting_type,
                 other) = self.parse_agenda(chamber, link)
                event = Event(session,
                              when,
                              'committee:meeting',
                              title,
                              location=room,
                              link=link,
                              details=description)
                event.add_participant('committee', committee)
                event['participants'].extend(member_list)
                event.add_source(url)
                event.add_source(link)
                self.save_event(event)
Example No. 19
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                # Sometimes we have a dead link; this only happens on
                # dead entries.
                continue

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                            when_and_where).groupdict()

            when_and_where = info['when']
            location = info['where']

            year = datetime.datetime.now().year
            # We can only scrape the current year's events in LA.
            when = parse_datetime(when_and_where, year)
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example No. 20
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                # Sometimes we have a dead link; this only happens on dead
                # entries. xpath(...)[0] raises IndexError, not KeyError,
                # when the link is missing.
                continue

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(',') if s] + [None] * 3)[:3]
            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                                              '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee_name,
                                  'committee',
                                  chamber='lower')
            event.add_document('Agenda',
                               guid,
                               type='agenda',
                               mimetype='application/pdf')
            event['link'] = guid

            self.save_event(event)
Example No. 21
    def scrape(self, session, chambers):
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("expected exactly one description span")
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          descr,
                          location=where)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [
                    committee,
                ]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids[committee]
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host',
                                      committee['name'],
                                      'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name, notice_href, mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type=bill['type'])

            self.save_event(event)
Example No. 22
    def scrape(self, session, chambers):
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            if time == "Noon":
                time = "12:00pm"

            if ':' not in time:
                self.warning('skipping event with invalid time: %s', time)
                continue
            when = "%s %s" % (date, time)
            try:
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M%p")
            except ValueError:
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M %p")

            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            if desc.lower().strip() in ["house convenes", "senate convenes"]:
                continue

            event = Event(session,
                          when,
                          'committee:meeting',
                          desc,
                          location=location)
            event.add_source(url)

            # desc is actually the ctty name.
            if "house" in desc.lower():
                chamber = "lower"
            elif "senate" in desc.lower():
                chamber = "upper"
            elif "joint" in desc.lower():
                chamber = "joint"
            else:
                self.logger.warning("Event %s chamber is unknown, skipping" %
                                    desc)
                continue

            event.add_participant('host', desc, 'committee', chamber=chamber)

            self.save_event(event)
Example No. 23
    def parse_row(self, row, session, chamber):
        dates = row.xpath("./td[@class='dateCell']")
        for date in dates:
            # alright, so we *may* not get a date, in which case the date
            # is the same as the last event.
            cal_date = date.xpath("./span[@class='calendarMonth']")[0]
            cal_day = date.xpath("./span[@class='calendarDay']")[0]
            self.last_month = cal_date.text_content()
            self.last_day = cal_day.text_content()
        time = row.xpath("./td[@class='timeCell']")
        if not time:
            return  # Nada.
        time = time[0]
        time = time.text.strip()
        dt_string = "%s %s %s %s" % (
            self.last_month,
            self.last_day,
            self.year,
            time
        )
        fmt = "%b %d %Y %I:%M %p"
        when = dt.datetime.strptime(dt_string, fmt)
        cells = {
            "event": "eventCell",
            "status": "statusCell",
            "location": "locationCell",
            "transcript": "transcriptCell",
            "video": "videoCell"
        }
        metainf = {}
        for thing in cells:
            mi = row.xpath("./td[@class='" + cells[thing] + "']")
            if mi == []:
                continue
            metainf[thing] = mi[0]

        if metainf['location'].xpath("./*") == []:
            metainf['location'] = self.last_location
        else:
            self.last_location = metainf['location']

        if "Session" in metainf['event'].text_content().strip():
            return  # Nada.

        loc_url = metainf['location'].xpath(".//a")
        loc_url = loc_url[0].attrib['href']
        event = Event(session,
                      when,
                      'committee:meeting',
                      metainf['event'].text_content().strip(),
                      chamber=chamber,
                      location=metainf['location'].text_content().strip(),
                      location_url=loc_url)
        event.add_participant("host", metainf['event'].text_content().strip(),
                              'committee', chamber=chamber)
        self.add_agenda(event, metainf['event'].xpath(".//a")[0].attrib['href'])
        return event
Example No. 24
    def parse_page(self, url, session):
        page = self.lxmlize(url)
        tables = page.xpath("//table[@class='pubhrgtbl']")
        date = None
        ctty = None
        chamber = 'other'
        for table in tables:
            metainf = {}
            rows = table.xpath(".//tr")
            for row in rows:
                tds = row.xpath("./*")
                if len(tds) < 2:
                    continue
                key, value = tds
                if key.tag == 'th':
                    date = key.text_content()
                    date = re.sub(r"\s+", " ", date)
                    date = re.sub(".*POSTPONED NEW DATE", "", date).strip()
                    ctty = value.xpath(".//strong")[0]
                    ctty = ctty.text_content()

                    chamber = 'other'
                    if "senate" in ctty.lower():
                        chamber = 'upper'
                    if "house" in ctty.lower():
                        chamber = 'lower'
                    if "joint" in ctty.lower():
                        chamber = 'joint'
                elif key.tag == 'td':
                    key = key.text_content().strip()
                    value = value.text_content().strip()
                    value = value.replace(u'\x96', '-')
                    value = re.sub(r"\s+", " ", value)
                    metainf[key] = value

            time = metainf['Time:']
            repl = {"A.M.": "AM", "P.M.": "PM"}
            for r in repl:
                time = time.replace(r, repl[r])

            time = re.sub("-.*", "", time)
            time = time.strip()

            year = dt.datetime.now().year

            date = "%s %s %s" % (date, year, time)
            datetime = dt.datetime.strptime(date, "%B %d %Y %I:%M %p")
            event = Event(session,
                          datetime,
                          'committee:meeting',
                          metainf['Public Hearing:'],
                          location=metainf['Place:'],
                          contact=metainf['Contact:'],
                          media_contact=metainf['Media Contact:'])
            event.add_source(url)
            event.add_participant('host', ctty, chamber=chamber)
            self.save_event(event)
Example No. 25
    def scrape(self, chamber, session):
        if chamber == 'other':
            return

        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        if chamber == 'upper':
            chamber_abbrev = 'S'
        else:
            chamber_abbrev = 'H'

        url = ("http://www.legis.iowa.gov/Schedules/meetingsList"
               "Chamber.aspx?chamber=%s&bDate=%02d/%02d/"
               "%d&eDate=%02d/%02d/%d" %
               (chamber_abbrev, start_date.month, start_date.day,
                start_date.year, end_date.month, end_date.day, end_date.year))

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        for link in page.xpath("//a[contains(@id, 'linkCommittee')]"):
            comm = link.text.strip()
            desc = comm + " Committee Hearing"
            location = link.xpath("string(../../td[3])")

            when = link.xpath("string(../../td[1])").strip()
            if 'cancelled' in when.lower() or "upon" in when.lower():
                continue
            if "To Be Determined" in when:
                continue

            if 'AM' in when:
                when = when.split('AM')[0] + " AM"
            else:
                when = when.split('PM')[0] + " PM"

            junk = ['Reception']
            for key in junk:
                when = when.replace(key, '')

            when = re.sub(r"\s+", " ", when).strip()
            if "tbd" in when.lower():
                # OK. This is a partial date of some sort.
                when = datetime.datetime.strptime(when,
                                                  "%m/%d/%Y TIME - TBD %p")
            else:
                try:
                    when = datetime.datetime.strptime(when,
                                                      "%m/%d/%Y %I:%M %p")
                except ValueError:
                    when = datetime.datetime.strptime(when, "%m/%d/%Y %I %p")

            event = Event(session, when, 'committee:meeting', desc, location)
            event.add_source(url)
            event.add_participant('host', comm, 'committee', chamber=chamber)
            self.save_event(event)
Example No. 26
    def scrape(self, chamber, session):
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.iteritems():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = 'Committee Meeting: ' + committee_name
            event = Event(session,
                          date,
                          'committee:meeting',
                          desc,
                          location=committee_name)
            for bill_id in bills:
                if 'B' in bill_id:
                    type_ = 'bill'
                else:
                    type_ = 'resolution'
                event.add_related_bill(bill_id,
                                       type=type_,
                                       description='consideration')

            event.add_participant('host',
                                  committee_name + ' Committee',
                                  'committee',
                                  chamber=chamber)
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            self.save_event(event)
Example No. 27
    def scrape(self, chamber, session):
        if chamber != "other":
            return None
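        # NOTE: `url` is never assigned in this snippet; the calendar page
        # address presumably comes from a module-level constant in the full
        # scraper.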
        page = self.lxmlize(url)
        meetings = page.xpath("//div[@class='Comm_item']")
        for meeting in meetings:
            metas = meeting.xpath(".//b")
            ctty = meeting.xpath(".//a")[0]
            ctty_name = ctty.text_content()
            info = metas[1:]
            datetime = metas[0]
            metainf = {}
            for meta in info:
                header = meta.text_content().strip()
                val = meta.tail
                metainf[header] = val or ""
            datetime = datetime.text_content().strip()
            # Tuesday, June 05, 2012 9:00 AM
            if "Canceled" in datetime:
                continue

            formats = ["%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y"]
            date_time = None

            for fmt in formats:
                try:
                    date_time = dt.datetime.strptime(datetime, fmt)
                except ValueError:
                    pass

            if date_time is None:
                continue

            event = Event(session,
                          date_time,
                          'committee:meeting',
                          ctty_name,
                          location=metainf['Room:'] or "State House")
            event.add_source(url)

            chamber = "other"
            chambers = {
                "house": "lower",
                "joint": "joint",
                "senate": "upper",
            }
            for c in chambers:
                if c in ctty_name.lower():
                    chamber = chambers[c]

            event.add_participant('host', ctty_name, chamber=chamber)
            # add chair?

            self.save_event(event)
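Several of these scrapers (Examples No. 22, No. 25, and No. 27) have to try more than one strptime format because the source pages are inconsistent. That retry loop can be factored into a small helper; a sketch (the helper name is my own, not from any of the scrapers):

    import datetime

    def parse_first_matching(text, formats):
        """Return the parse from the first matching format, else None."""
        for fmt in formats:
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    # e.g. the two formats used in Example No. 27:
    when = parse_first_matching("Tuesday, June 05, 2012 9:00 AM",
                                ["%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y"])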
Example No. 28
    def scrape(self, session, chambers):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception("expected exactly one committee link")

            comit_url = comit_url[0]
            who = self.scrape_participants(session, comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          name,
                          location=where,
                          link=notice)

            event.add_source(calurl)
            event.add_participant('host',
                                  cttie,
                                  'committee',
                                  chamber=cttie_chamber)
            event.add_document("notice", notice, mimetype='application/pdf')

            for thing in who:
                event.add_participant(thing['title'],
                                      thing['name'],
                                      'legislator',
                                      chamber=cttie_chamber)

            self.save_event(event)
Example No. 29
    def scrape_page(self, url, session, chamber):
        page = self.lxmlize(url)

        ctty_name = page.xpath(
            "//span[@class='heading']")[0].text_content().replace(
                "Hearing Notice For ", "")
        tables = page.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      description,
                      location=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            ctty_name = ctty_name.replace('Hearing Notice For', '')
        event.add_participant('host', ctty_name, 'committee', chamber=chamber)

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            event.add_related_bill(bill_id,
                                   description=description,
                                   type='consideration')

        self.save_event(event)
Example No. 30
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                # Sometimes we have a dead link; this only happens on
                # dead entries.
                continue

            committee = link.xpath("string(../../../td[1])").strip()

            when_and_where = link.xpath("string(../../../td[2])").strip()

            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            # We can only scrape the current year's events in LA.
            when = parse_datetime(when_and_where, year)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)