Exemplo n.º 1
0
    def scrape(self, chamber, session):
        """Scrape today's OH floor sessions for one chamber into events."""
        url = "http://www.legislature.state.oh.us/today.cfm"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            # Highlighted cells hold the date; the description sits in the
            # first sibling cell (Senate) or the second one (House).
            for cell in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"):
                date = cell.text.strip()

                sibling = cell.getnext()
                if chamber != 'upper':
                    sibling = sibling.getnext()
                desc = sibling.text.strip()

                match = re.match(r'^Session at (\d+:\d+ [pa]\.m\.)', desc)
                if not match:
                    continue

                time = match.group(1)
                time = time.replace('a.m.', 'AM').replace('p.m.', 'PM')

                when = datetime.datetime.strptime(
                    "%s 2011 %s" % (date, time), "%a. %b %d %Y %I:%M %p")
                when = self._tz.localize(when)

                chamber_name = {
                    'upper': 'Senate',
                    'lower': 'House'
                }[chamber]

                event = Event(session, when, 'floor_time', desc,
                              "%s Chamber" % chamber_name)
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 2
0
 def scrape(self, chamber, session):
     """Scrape VT 'Highlights' posts as chamber-neutral ('other') events."""
     if chamber != "other":
         return
     url = "http://www.leg.state.vt.us/HighlightsMain.cfm"
     page = self.lxmlize(url)
     paragraphs = page.xpath(
         "//p[@class='HighlightsNote' or @class='HighlightsDate']")
     # Note paragraphs accumulate until a date paragraph closes the group.
     grouped = {}
     pending = []
     for para in paragraphs:
         if para.attrib['class'] == "HighlightsNote":
             pending.append(para)
         else:
             posted = para.text[len("Posted "):]
             grouped[posted] = pending
             pending = []
     for posted, notes in grouped.items():
         when = dt.datetime.strptime(posted, "%m/%d/%Y")
         for note in notes:
             e = Event(session,
                       when,
                       "other",
                       note.text_content(),
                       location="state house")
             e.add_source(url)
             self.save_event(e)
Exemplo n.º 3
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly schedule page into committee events."""
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # Each agenda-PDF image link anchors one scheduled meeting row.
            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                guid = link.attrib['href']
                committee = link.xpath("string(../../../td[1])").strip()
                when_and_where = link.xpath("string(../../../td[2])").strip()

                # Location is the trailing comma-separated component.
                location = when_and_where.split(',')[-1]
                when = parse_datetime(when_and_where, session)

                event = Event(session, when, 'committee:meeting',
                              'Committee Meeting: %s' % committee,
                              location=location)
                event.add_participant('committee', committee)
                event['link'] = guid

                self.save_event(event)
Exemplo n.º 4
0
    def scrape_committee_events(self, session, code, name):
        """Scrape one CT committee's calendar JSON feed into events."""
        events_url = (
            'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?'
            'comm_code={}'.format(code))
        events = json.loads(self.get(events_url).text)

        fmt = '%Y-%m-%dT%H:%M:%SZ'
        for info in events:
            title = info['title']
            if title is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            if title.startswith('CANCELLED:'):
                self.info(
                    "Cancelled event found; it will be skipped: {}".format(
                        title))
                continue

            event = Event(
                session=session,
                when=datetime.datetime.strptime(info['start'], fmt),
                end=datetime.datetime.strptime(info['end'], fmt),
                type='committee:meeting',
                description=title,
                location="{0} {1}".format(info['building'].strip(),
                                          info['location'].strip()))
            event.add_source(events_url)

            self.save_event(event)
Exemplo n.º 5
0
    def scrape_upper(self, session):
        """Scrape OK Senate committee meeting notices into events.

        Skips links whose following marker text reads 'Cancelled'.
        """
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'Meeting_Notice')]"):
            comm = link.text.strip()
            comm = re.sub(r'\s+', ' ', comm)

            if link.getnext().text == 'Cancelled':
                continue

            # The meeting date lives in a preceding MsoNormal paragraph.
            date_path = "../../preceding-sibling::p[@class='MsoNormal']"
            date = link.xpath(date_path)[-1].xpath("string()")

            # Text after the <br> holds "H:MM am, location".
            time_loc = link.xpath("../br")[0].tail.strip()
            # FIX: raw string -- "\d" in a plain string is an invalid escape
            # sequence (SyntaxWarning on modern Python).
            time = re.match(r"\d+:\d+ (am|pm)", time_loc).group(0)
            location = time_loc.split(', ')[1].strip()

            dt = "%s %s" % (date, time)
            dt = datetime.datetime.strptime(dt, "%A, %B %d, %Y %I:%M %p")

            event = Event(session, dt, 'committee:meeting',
                          "%s Committee Meeting" % comm,
                          location)
            event.add_source(url)
            self.save_event(event)
Exemplo n.º 6
0
    def scrape_house_weekly_schedule(self, session):
        """Collect committee meetings from the LA House weekly schedule."""
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # One agenda-PDF image link per scheduled meeting.
            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                guid = link.attrib['href']

                committee = link.xpath("string(../../../td[1])").strip()
                when_and_where = link.xpath("string(../../../td[2])").strip()

                when = parse_datetime(when_and_where, session)
                # The final comma-delimited chunk names the room.
                location = when_and_where.split(',')[-1]

                event = Event(session,
                              when,
                              'committee:meeting',
                              'Committee Meeting: %s' % committee,
                              location=location)
                event.add_participant('committee', committee)
                event['link'] = guid

                self.save_event(event)
Exemplo n.º 7
0
    def scrape(self, chamber, session):
        """Group CA committee hearings by (location, date) and emit events."""
        bills_discussed = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            # The first three letters of the location encode the chamber.
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[location[0:3]]
            if event_chamber != chamber:
                continue

            bills_discussed[(location, date)].append(hearing.bill_id)

        for (location, date), bills in bills_discussed.iteritems():
            # Reformat each matched bill id as "TYPE NUMBER".
            formatted = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', b).groups()
                         for b in bills]

            desc = 'Committee Meeting\n%s\nDiscussed: %s' % (
                location, ', '.join(formatted))

            event = Event(session, date, 'committee:meeting', desc,
                          location=location)
            event.add_participant('committee', location)

            self.save_event(event)
Exemplo n.º 8
0
    def scrape_event(self, chamber, session, obj):
        """Build and return an Event from one NY Senate meeting JSON object.

        Returns None when the meeting's year does not match the session.
        """
        meeting = obj['data']['meeting']
        # Timestamps arrive in milliseconds since the epoch.
        date = dt.datetime.fromtimestamp(int(meeting['meetingDateTime']) / 1000)
        if str(date.year) not in session:
            return
        event = Event(session,
                      date,
                      'committee:meeting',
                      description='Committee Meeting: ' + meeting['committeeName'],
                      location=meeting['location'] or 'No location given.')
        event.add_source(obj['url'])
        event.add_participant('chair',
                              meeting['committeeChair'],
                              'legislator',
                              chamber='upper')
        event.add_participant('host',
                              meeting['committeeName'],
                              'committee',
                              chamber='upper')

        # Split the alpha prefix from the number in ids like "S1234".
        pattern = r'([a-z]+)(\d+)'
        for bill in meeting['bills']:
            bill_id = ' '.join(
                re.search(pattern, bill['senateBillNo'], re.I).groups())
            event.add_related_bill(bill_id,
                                   type='bill',
                                   description=bill['summary']
                                   or 'No description given.')
        return event
Exemplo n.º 9
0
    def scrape_lower_events(self, session):
        """Scrape NY Assembly hearing listings into committee events."""
        url = "http://assembly.state.ny.us/leg/?sh=hear"

        year = datetime.date.today().year

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for td in page.xpath("//td[@bgcolor='#99CCCC']"):
                desc = td.xpath("string(following-sibling::td/strong)")
                # Senate standing committees are handled by the upper scraper.
                if "Senate Standing Committee" in desc:
                    continue

                notes = td.xpath("string(../following-sibling::tr[1]/td[2])")
                notes = re.sub(r"\*\*Click here to view hearing notice\*\*",
                               "", notes).strip()

                location = td.xpath(
                    "string(../following-sibling::tr[2]/td[2])")

                date = " ".join(td.text.split()[0:2]).strip()

                # Times look like "10:00 AM - 12:00 PM"; the second half,
                # when present, is the end time.
                time_cell = td.xpath("../following-sibling::tr[3]/td[2]")[0]
                pieces = time_cell.text.split("-")

                when = "%s %d %s" % (date, year, pieces[0].strip())
                when = _parse_date(when.replace(".", ""))

                end = None
                if len(pieces) > 1:
                    end = "%s %d %s" % (date, year, pieces[1].strip())
                    end = _parse_date(end.replace(".", ""))

                event = Event(session, when, "committee:meeting", desc,
                              location, end=end, notes=notes)
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 10
0
    def scrape_upper(self, session):
        """Parse the OK Senate meeting-notices page text into events."""
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        text = page.text_content()
        # Everything before the header is page chrome.
        _, text = text.split('MEETING NOTICES')
        re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
        # Pair each date match with the chunk of text that follows it.
        chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

        for match, data in chunks:
            when = datetime.datetime.strptime(match.group(),
                                              "%A, %B %d, %Y")

            lines = [x.strip() for x in data.splitlines() if x.strip()]

            time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
            time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
            parsed = time.strptime(time_, '%I:%M %p')
            when += datetime.timedelta(hours=parsed.tm_hour,
                                       minutes=parsed.tm_min)

            title = lines[0]

            where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
            where = where.strip()

            event = Event(session,
                          when,
                          'committee:meeting',
                          title,
                          location=where)
            event.add_source(url)

            self.save_event(event)
Exemplo n.º 11
0
def actions_to_events(state):
    for bill in db.bills.find({'state': state}):
        print "Converting %s actions to events" % bill['_id']

        count = 1
        for action in bill['actions']:
            guid = "%s:action:%06d" % (bill['_id'], count)
            count += 1

            event = db.events.find_one({'state': state,
                                        '_guid': guid})

            description = "%s: %s" % (bill['bill_id'], action['action'])
            data = Event(bill['session'], action['date'],
                         'bill:action', description, location=action['actor'],
                         action_type=action['type'])
            data.add_participant('actor', action['actor'])
            data['_guid'] = guid
            data['state'] = state

            if not event:
                data['created_at'] = datetime.datetime.utcnow()
                data['updated_at'] = data['created_at']
                _insert_with_id(data)
            else:
                update(event, data, db.events)
Exemplo n.º 12
0
    def scrape(self, chamber, session):
        """Scrape NY Senate committee meetings from the public iCal feed.

        Only the upper chamber is handled; other chambers return early.
        """
        if chamber == 'upper':
            url = ("http://www.nysenate.gov/calendar/ical/"
                   "senator%3DAll%2526type%3D3%2526committee%3DAll"
                   "%2526initiative%3DAll")
        else:
            return

        with self.urlopen(url) as page:
            cal = icalendar.Calendar.from_string(page)

            for comp in cal.walk():
                # Only VEVENT components carry meeting data.
                if comp.name != 'VEVENT':
                    continue

                text = str(comp['SUMMARY'])
                if 'Committee Meeting' not in text:
                    continue

                start = comp['DTSTART'].dt
                end = comp['DTEND'].dt
                # NOTE(review): uid is computed but never used below.
                uid = str(comp['UID'])
                event_url = comp['URL']

                # Location is not in the feed; fetched from the event page.
                location = self.get_upper_location(event_url)
                print location

                event = Event(session, start, 'committee:meeting',
                              text, location, end)
                event.add_source(url)
                event.add_source(event_url)

                self.save_event(event)
Exemplo n.º 13
0
    def scrape(self, chamber, session):
        """Scrape the OH 'today' page for floor sessions of one chamber."""
        url = "http://www.legislature.state.oh.us/today.cfm"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            # Date cells are highlighted; the description follows in the
            # first (Senate) or second (House) sibling cell.
            for td in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"):
                date = td.text.strip()

                sib = td.getnext()
                if chamber != 'upper':
                    sib = sib.getnext()
                desc = sib.text.strip()

                match = re.match(r'^Session at (\d+:\d+ [pa]\.m\.)', desc)
                if not match:
                    continue

                time = match.group(1).replace('a.m.', 'AM').replace(
                    'p.m.', 'PM')

                when = self._tz.localize(datetime.datetime.strptime(
                    "%s 2011 %s" % (date, time), "%a. %b %d %Y %I:%M %p"))

                chamber_name = {'upper': 'Senate',
                                'lower': 'House'}[chamber]

                event = Event(session, when, 'floor_time', desc,
                              "%s Chamber" % chamber_name)
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 14
0
    def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            # Only emit an event when all three fields were found.
            if when and description and location:
                kwargs = {}
                if group in self.metadata["chambers"].keys():
                    kwargs["chamber"] = group
                agenda = self.get_agenda(meeting)
                if agenda:
                    kwargs["agenda"] = agenda

                # Event prototype is as follows:
                # class Event(SourcedObject):
                #    def __init__(self, session, when, type,
                #                 description, location, end=None, **kwargs)
                event = Event(self.session, when, "committee:meeting", description, location, **kwargs)
                # NOTE(review): `url` is not defined in this method or its
                # parameters -- this line raises NameError at runtime.
                # Confirm the intended source URL and pass it in explicitly.
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 15
0
def actions_to_events(state):
    """Create or update one bill:action event for every action on every
    bill stored for the given state."""
    for bill in db.bills.find({'state': state}):
        print "Converting %s actions to events" % bill['_id']

        # 1-based sequence number keeps per-bill action guids stable.
        count = 1
        for action in bill['actions']:
            guid = "%s:action:%06d" % (bill['_id'], count)
            count += 1

            # Existing event with the same guid (if any) is updated below.
            event = db.events.find_one({'state': state,
                                        '_guid': guid})

            description = "%s: %s" % (bill['bill_id'], action['action'])
            data = Event(bill['session'], action['date'],
                         'bill:action', description, location=action['actor'],
                         action_type=action['type'])
            data.add_participant('actor', action['actor'])
            data['_guid'] = guid
            data['state'] = state

            if not event:
                data['created_at'] = datetime.datetime.utcnow()
                data['updated_at'] = data['created_at']
                _insert_with_id(data)
            else:
                update(event, data, db.events)
Exemplo n.º 16
0
    def scrape(self, chamber, session):
        """Scrape the KY legislative calendar into committee events."""
        if session != '2011 Regular Session':
            raise NoDataForPeriod(session)

        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
                date = div.xpath("string(../../span[1])").strip()

                try:
                    time, location = div.xpath("string(span[1])").split(',')
                except ValueError:
                    # Days with no meetings lack the "time, location" span.
                    continue

                when = self._tz.localize(datetime.datetime.strptime(
                    "%s %s" % (date, time), "%A, %B %d, %Y %I:%M%p"))

                desc = div.xpath("string(span[2])").strip()
                event = Event(session, when, 'committee:meeting', desc,
                              location=location)
                event.add_source(url)

                self.save_event(event)
Exemplo n.º 17
0
    def scrape(self, chamber, session):
        """Scrape San Diego council meeting documents into council:meeting events."""
        r = requests.get("http://google.sannet.gov/search?num=100&requiredfields=PATH:councildockets|PATH:councilminutes|PATH:councilresults&getfields=DOCUMENT_URL.DOC_DATE.TITLE.SORTORDER&sort=date:D:S:d1&output=xml_no_dtd&ie=UTF-8&client=scs_ocd&filter=0&site=documents&config=sirecouncilmeetings.js&proxystylesheet=sirefrontend&q=Council+inmeta:DOC_DATE_NUM:20130101..20140101")
        soup = BeautifulSoup(r.text)
        table = soup.find_all('table')[-1]
        rows = table.findAll('tr')
        for row in rows:
            date_cell = row.findAll('script')[0].text
            if date_cell.startswith('build_date_cell'):
                # The script call embeds the date as 'YYYY-MM-DD'.
                date = date_cell[17:27]
                link = row.find('a')
                url = link['href']
                title = link.text

                # BUG FIX: the format previously used %M (minute) instead of
                # %m (month), so e.g. "2013-07-01" parsed as Jan 1 at 00:07.
                when = datetime.datetime.strptime(date, "%Y-%m-%d")
                when = self._tz.localize(when)

                desc = title
                # XXX: Process `event' for related bills.
                event = Event(session, when, 'council:meeting', desc,
                              location=None)
                event.add_source(url)

                self.save_event(event)
Exemplo n.º 18
0
    def scrape_upper_events(self, session):
        """Scrape FL Senate daily-calendar RSS into committee events."""
        url = "http://flsenate.gov/Session/DailyCalendarRSS.cfm?format=rss"
        with self.urlopen(url) as page:
            feed = feedparser.parse(page)

            for entry in feed['entries']:
                summary = entry['summary']
                if 'Committee' not in summary:
                    continue

                date = datetime.datetime(*entry['updated_parsed'][:6])
                match = re.match(r'(\d+):(\d+)', entry['title'])
                if not match:
                    continue

                when = self._tz.localize(datetime.datetime(
                    date.year, date.month, date.day,
                    int(match.group(1)), int(match.group(2)), 0))

                # Summary is "description - location".
                desc = summary.split(' - ')[0]
                location = summary.split(' - ')[1]

                event = Event(session, when, 'committee:meeting', desc,
                              location)
                event.add_source(url)

                self.save_event(event)
Exemplo n.º 19
0
    def scrape(self, chamber, session):
        """Scrape OH floor times; the lower chamber also scrapes house meetings."""
        if chamber == "other":
            return  # XXX: Change to invocation?

        if chamber == "lower":
            self.house_meetings()

        url = "http://www.legislature.state.oh.us/today.cfm"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            # Highlighted cells carry the date; the description follows in
            # the first (Senate) or second (House) sibling cell.
            for cell in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"):
                date = cell.text.strip()

                neighbor = cell.getnext()
                if chamber != "upper":
                    neighbor = neighbor.getnext()
                desc = neighbor.text.strip()

                match = re.match(r"^Session at (\d+:\d+ [pa]\.m\.)", desc)
                if not match:
                    continue

                time = match.group(1)
                time = time.replace("a.m.", "AM").replace("p.m.", "PM")

                when = datetime.datetime.strptime(
                    "%s 2011 %s" % (date, time), "%a. %b %d %Y %I:%M %p")
                when = self._tz.localize(when)

                chamber_name = {"upper": "Senate", "lower": "House"}[chamber]

                event = Event(session, when, "floor_time", desc,
                              "%s Chamber" % chamber_name)
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 20
0
    def scrape(self, chamber, session):
        """Group CA hearings by (location, date) and emit one event per group."""
        bills_discussed = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            # The 'Asm'/'Sen' prefix on the location names the chamber.
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[location[0:3]]
            if event_chamber != chamber:
                continue

            bills_discussed[(location, date)].append(hearing.bill_id)

        for (location, date), bills in bills_discussed.iteritems():
            # Reformat each matched bill id as "TYPE NUMBER".
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bills
            ]

            desc = 'Committee Meeting\n%s\nDiscussed: %s' % (
                location, ', '.join(bills))

            event = Event(session, date, 'committee:meeting', desc,
                          location=location)
            event.add_participant('committee', location, 'committee')

            self.save_event(event)
Exemplo n.º 21
0
    def scrape_upper_events(self, session):
        """Scrape the FL Senate daily calendar RSS feed into events."""
        url = "http://flsenate.gov/Session/DailyCalendarRSS.cfm?format=rss"
        with self.urlopen(url) as page:
            feed = feedparser.parse(page)

            for entry in feed['entries']:
                if 'Committee' not in entry['summary']:
                    continue

                date = datetime.datetime(*entry['updated_parsed'][:6])
                match = re.match(r'(\d+):(\d+)', entry['title'])
                if not match:
                    continue

                hour = int(match.group(1))
                minute = int(match.group(2))
                when = self._tz.localize(datetime.datetime(
                    date.year, date.month, date.day, hour, minute, 0))

                # Summary is "description - location".
                parts = entry['summary'].split(' - ')
                desc = parts[0]
                location = parts[1]

                event = Event(session, when, 'committee:meeting',
                              desc, location)
                event.add_source(url)

                self.save_event(event)
Exemplo n.º 22
0
    def scrape(self, chamber, session):
        """Scrape the AR scheduled-meetings pipe-delimited feed into events."""
        if chamber == 'other':
            return

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.urlopen(url)
        page = csv.reader(StringIO.StringIO(page.bytes), delimiter='|')

        for row in page:
            desc = row[7].strip()

            # Description ends with the hosting chamber, e.g. "...- HOUSE".
            match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
            if not match:
                continue

            comm_chamber = {'HOUSE': 'lower',
                            'SENATE': 'upper'}[match.group(2)]
            if comm_chamber != chamber:
                continue

            comm = re.sub(r'\s+', ' ', match.group(1).strip())
            location = row[5].strip() or 'Unknown'
            when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')

            event = Event(session, when, 'committee:meeting',
                          "%s MEETING" % comm,
                          location=location)
            event.add_source(url)

            event.add_participant('host', comm, chamber=chamber)

            # Some rows carry a coded time note instead of a real time.
            time = row[3].strip()
            if time in TIMECODES:
                event['notes'] = TIMECODES[time]

            self.save_event(event)
Exemplo n.º 23
0
    def scrape_committee_events(self, session, code, name):
        """Scrape a CT committee's calendar JSON into committee:meeting events."""
        events_url = ('http://www.cga.ct.gov/basin/fullcalendar/commevents.php?'
                      'comm_code={}'.format(code))
        events = json.loads(self.get(events_url).text)

        datetime_format = '%Y-%m-%dT%H:%M:%SZ'
        for info in events:
            if info['title'] is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            if info['title'].startswith('CANCELLED:'):
                self.info("Cancelled event found; it will be skipped: {}".
                          format(info['title']))
                continue

            when = datetime.datetime.strptime(info['start'], datetime_format)
            end = datetime.datetime.strptime(info['end'], datetime_format)
            where = "{0} {1}".format(info['building'].strip(),
                                     info['location'].strip())
            event = Event(session=session,
                          when=when,
                          end=end,
                          type='committee:meeting',
                          description=info['title'],
                          location=where)
            event.add_source(events_url)

            self.save_event(event)
Exemplo n.º 24
0
    def scrape_upper(self, session):
        """Parse OK Senate meeting notices from page text into events."""
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        text = page.text_content()
        # Drop everything before the notices header.
        _, text = text.split('MEETING NOTICES')
        re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
        date_matches = re.finditer(re_date, text)
        bodies = re.split(re_date, text)[1:]

        for match, data in zip(date_matches, bodies):
            when = datetime.datetime.strptime(match.group(),
                                              "%A, %B %d, %Y")

            lines = filter(None, [x.strip() for x in data.splitlines()])

            # \x96 is byte 0x96 -- presumably a cp1252 en dash after the
            # time; it terminates the captured group.
            raw_time = re.search(r'^\s*TIME:\s+(.+?)\s+\x96',
                                 data, re.M).group(1)
            raw_time = raw_time.replace('a.m.', 'AM').replace('p.m.', 'PM')
            parsed = time.strptime(raw_time, '%I:%M %p')
            when += datetime.timedelta(hours=parsed.tm_hour,
                                       minutes=parsed.tm_min)

            title = lines[0]

            where = re.search(r'^\s*PLACE:\s+(.+)',
                              data, re.M).group(1).strip()

            event = Event(session, when, 'committee:meeting', title,
                          location=where)
            event.add_source(url)

            self.save_event(event)
Exemplo n.º 25
0
    def scrape_page(self, url, chamber, session):
        """Scrape one WI meeting-notice page into a committee:meeting event."""
        page = self.lxmlize(url)
        # XPath selectors for the metadata divs found on notice pages.
        info_blocks = {
            "canceled": "//div[@class='cancelled']",
            "committee": "//div[@class='titlemeetingtype']",
            "chamber": "//div[@class='titlehouse']",
            "datetime": "//div[@class='datetimelocation']"
        }
        metainf = {}
        for block in info_blocks:
            info = page.xpath(info_blocks[block])
            if info == []:
                continue
            metainf[block] = {
                "obj": info[0],
                "txt": info[0].text_content()
            }

        # Pages without a committee block are not meeting notices.
        if 'committee' not in metainf:
            return

        # Skip meetings flagged as cancelled.
        if 'canceled' in metainf:
            return

        obj = metainf['datetime']['obj']
        dates = obj.xpath("./*")
        date_time = obj.text.strip()
        # Stitch together text fragments split across child elements.
        for date in dates:
            if date.tail is not None:
                date_time += " %s" % (date.tail.strip())
        # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol
        splits = [ 'AM', 'PM' ]
        date_times = None
        for split in splits:
            if split in date_time:
                date_times = [ x.strip() for x in date_time.split(split, 1) ]
                date_times[0] += " " + split

        # NOTE(review): if neither 'AM' nor 'PM' appears in date_time,
        # date_times stays None and the next line raises TypeError.
        time = date_times[0]
        place = date_times[1]

        committee = metainf['committee']['txt']
        # NOTE(review): raises KeyError if the page lacks a 'titlehouse'
        # div ('chamber' absent from metainf) -- confirm intended.
        chamber = metainf['chamber']['txt']

        try:
            chamber = {
                "Senate": "upper",
                "Assembly": "lower",
                "Joint": "joint"
            }[chamber]
        except KeyError:
            chamber = 'other'

        # Wednesday, May 23, 2012 10:00 AM
        datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
        event = Event(session, datetime, 'committee:meeting',
                      committee, location=place)
        event.add_participant('host', committee, 'committee', chamber=chamber)
        event.add_source(url)
        self.save_event(event)
Exemplo n.º 26
0
    def scrape(self, chamber, session):
        """Emit one committee:meeting event per unique (location, date) pair."""
        seen = set()
        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            # The location prefix ('Asm'/'Sen') identifies the chamber.
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[location[0:3]]
            if event_chamber != chamber:
                continue

            key = (location, date)
            if key in seen:
                continue
            seen.add(key)

            event = Event(session, date, 'committee:meeting',
                          'Committee Meeting\n%s' % location,
                          location=location)
            event.add_participant('committee', location)

            self.save_event(event)
Exemplo n.º 27
0
    def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            # Only emit an event when all three fields parsed successfully.
            if when and description and location:
                kwargs = {}
                # Only chamber groups get a `chamber` kwarg; 'commission'
                # events are left chamber-less.
                if group in self.metadata['chambers'].keys():
                    kwargs['chamber'] = group
                agenda = self.get_agenda(meeting)
                if agenda:
                    kwargs['agenda'] = agenda

                # Event prototype is as follows:
                # class Event(SourcedObject):
                #    def __init__(self, session, when, type,
                #                 description, location, end=None, **kwargs)
                event = Event(self.session, when, 'committee:meeting',
                              description, location, **kwargs)
                # NOTE(review): `url` is not defined in this method or its
                # parameters — unless it is a module-level name this raises
                # NameError. Confirm and pass the page URL in explicitly.
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 28
0
    def scrape_upper(self, session):
        """Scrape Oklahoma Senate committee meeting notices."""
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'Meeting_Notice')]"):
            # Collapse runs of whitespace in the committee name.
            committee = re.sub(r'\s+', ' ', link.text.strip())

            # A sibling element marked "Cancelled" means the meeting is off.
            if link.getnext().text == 'Cancelled':
                continue

            date_path = "../../preceding-sibling::p[@class='MsoNormal']"
            date = link.xpath(date_path)[-1].xpath("string()")

            # Tail text looks like "10:30 am, Room 535".
            time_loc = link.xpath("../br")[0].tail.strip()
            time = re.match("\d+:\d+ (am|pm)", time_loc).group(0)
            location = time_loc.split(', ')[1].strip()

            when = datetime.datetime.strptime("%s %s" % (date, time),
                                              "%A, %B %d, %Y %I:%M %p")

            event = Event(session, when, 'committee:meeting',
                          "%s Committee Meeting" % committee,
                          location)
            event.add_source(url)
            self.save_event(event)
Exemplo n.º 29
0
    def scrape(self, chamber, session):
        """Scrape Arkansas committee meetings from the legislature's
        pipe-delimited schedule file, keeping only `chamber`'s meetings."""
        if chamber == "other":
            return

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        raw = self.urlopen(url)
        rows = csv.reader(StringIO.StringIO(raw.bytes), delimiter="|")

        for row in rows:
            desc = row[7].strip()

            # Only rows suffixed "- HOUSE" / "- SENATE" are chamber meetings.
            match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
            if not match:
                continue

            if {"HOUSE": "lower", "SENATE": "upper"}[match.group(2)] != chamber:
                continue

            comm = re.sub(r"\s+", " ", match.group(1).strip())
            location = row[5].strip() or "Unknown"
            when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")

            event = Event(session, when, "committee:meeting", "%s MEETING" % comm, location=location)
            event.add_source(url)
            event.add_participant("committee", comm, chamber=chamber)

            # Some rows carry a coded time (e.g. "upon adjournment").
            time_code = row[3].strip()
            if time_code in TIMECODES:
                event["notes"] = TIMECODES[time_code]

            self.save_event(event)
Exemplo n.º 30
0
    def scrape(self, chamber, session):
        """Scrape the Kentucky legislative calendar (2011 session only)."""
        if session != '2011 Regular Session':
            raise NoDataForPeriod(session)

        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
                date = div.xpath("string(../../span[1])").strip()

                # span[1] holds "time, location"; a ValueError from the
                # unpack means no meeting is listed for that day.
                try:
                    time, location = div.xpath("string(span[1])").split(',')
                except ValueError:
                    continue

                when = self._tz.localize(
                    datetime.datetime.strptime("%s %s" % (date, time),
                                               "%A, %B %d, %Y %I:%M%p"))

                desc = div.xpath("string(span[2])").strip()
                event = Event(session, when, 'committee:meeting',
                              desc, location=location)
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 31
0
    def scrape_committee_events(self, session, code, name):
        """Scrape the CT calendar page for committee `code`, saving each
        listed meeting as an event hosted by committee `name`."""
        url = ("http://www.cga.ct.gov/asp/menu/"
               "CGACommCal.asp?comm_code=%s" % code)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            cal_table = page.xpath("//table[contains(@summary, 'Calendar')]")[0]

            date_str = None
            for row in cal_table.xpath("tr[2]//tr"):
                col1 = row.xpath("string(td[1])").strip()
                col2 = row.xpath("string(td[2])").strip()

                if not col1:
                    # Empty first column: either the "no meetings"
                    # sentinel or a date header for the rows below.
                    if col2 == "No Meetings Scheduled":
                        return
                    date_str = col2
                    continue

                # Otherwise this is a committee event row.
                when = self._tz.localize(datetime.datetime.strptime(
                    date_str + " " + col1, "%A, %B %d, %Y %I:%M %p"))

                location = row.xpath("string(td[3])").strip()
                guid = row.xpath("td/a")[0].attrib["href"]

                event = Event(session, when, "committee meeting", col2,
                              location, _guid=guid)
                event.add_source(url)
                event.add_participant("committee", name, chamber="joint")
                self.save_event(event)
Exemplo n.º 32
0
    def scrape_upper_events(self, session):
        """Scrape NY Senate committee meetings from the senate iCal feed."""
        url = (
            "http://www.nysenate.gov/calendar/ical/"
            "senator%3DAll%2526type%3D3%2526committee%3DAll"
            "%2526initiative%3DAll"
        )

        with self.urlopen(url) as page:
            cal = icalendar.Calendar.from_string(page)

            for comp in cal.walk():
                # Only VEVENT components describe meetings.
                if comp.name != "VEVENT":
                    continue

                summary = str(comp["SUMMARY"])
                if "Committee Meeting" not in summary:
                    continue

                start = _tz.localize(comp["DTSTART"].dt)
                end = _tz.localize(comp["DTEND"].dt)
                uid = str(comp["UID"])
                event_url = comp["URL"]

                location = self.get_upper_location(event_url)

                event = Event(session, start, "committee:meeting",
                              summary, location, end)
                event.add_source(url)
                event.add_source(event_url)
                self.save_event(event)
Exemplo n.º 33
0
    def scrape(self, chamber, session):
        """Scrape the KY legislative calendar; each div is one meeting."""
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
                date = div.xpath("string(../../span[1])").strip()

                # span[1] holds "time, location"; a ValueError from the
                # unpack means no meeting is scheduled.
                try:
                    time, location = div.xpath("string(span[1])").split(',')
                except ValueError:
                    continue

                when = self._tz.localize(
                    datetime.datetime.strptime("%s %s" % (date, time),
                                               "%A, %B %d, %Y %I:%M%p"))

                desc = div.xpath("string(span[2])").strip()
                agenda = div.xpath("string(span[3])").strip()
                # XXX: Process `agenda' for related bills.
                event = Event(session, when, 'committee:meeting',
                              desc, location=location)
                event.add_source(url)

                # desc is actually the committee name.
                event.add_participant('host', desc, 'committee',
                                      chamber=chamber)
                self.save_event(event)
Exemplo n.º 34
0
    def scrape(self, chamber, session):
        """Scrape Alaska committee hearings for the full calendar year.

        Both chambers come back in one request (Chamb=B); each hearing's
        chamber is recovered from its "(H)"/"(S)" prefix.
        """
        if session != '27':
            raise NoDataForPeriod(session)

        if chamber == 'other':
            return

        # Resolve this session's term years from the scraper metadata.
        year, year2 = None, None
        for term in self.metadata['terms']:
            if term['sessions'][0] == session:
                year = str(term['start_year'])
                year2 = str(term['end_year'])
                break

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year[2:]

        url = ("http://www.legis.state.ak.us/basis/"
               "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
               "Comty=&Root=&Sel=1&Button=Display" % (session, date1, date2))

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
            for font in page.xpath(path):
                match = re.match(r'^\((H|S)\)(.+)$', font.text)

                chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
                comm = match.group(2).strip().title()

                # Time and location live in the following table row.
                next_row = font.xpath("../../following-sibling::tr[1]")[0]

                when = next_row.xpath("string(td[1]/font)").strip()
                when = datetime.datetime.strptime(when + " " + year,
                                                  "%b %d  %A %I:%M %p %Y")
                when = self._tz.localize(when)

                where = next_row.xpath("string(td[2]/font)").strip()

                description = "Committee Meeting\n"
                description += comm

                event = Event(session,
                              when,
                              'committee:meeting',
                              description,
                              location=where)

                # BUG FIX: the agenda link was previously assigned to
                # `event` *before* the Event was created, raising NameError
                # (and a py2 `print` debug statement was left in). Attach
                # the link after construction instead.
                links = font.xpath(
                    "../../td/font/a[contains(@href, 'get_documents')]")
                if links:
                    event['link'] = links[0].attrib['href']

                event.add_source(url)
                self.save_event(event)
Exemplo n.º 35
0
    def scrape(self, chamber, session):
        """Scrape committee meetings. Only runs on the 'other' chamber
        pass; each event's real chamber is derived from the committee
        name afterwards."""
        if chamber != "other":
            return None
        # NOTE(review): `url` is never defined in this method — unless it
        # is a module-level constant this raises NameError; confirm.
        page = self.lxmlize(url)
        meetings = page.xpath("//div[@class='Comm_item']")
        for meeting in meetings:
            metas = meeting.xpath(".//b")
            ctty = meeting.xpath(".//a")[0]
            ctty_name = ctty.text_content()
            # First <b> holds the date/time; the rest are "Header: value"
            # pairs (e.g. "Room:") whose value is the tail text.
            info = metas[1:]
            datetime = metas[0]
            metainf = {}
            for meta in info:
                header = meta.text_content().strip()
                val = meta.tail
                metainf[header] = val or ""
            datetime = datetime.text_content().strip()
            # Tuesday, June 05, 2012 9:00 AM
            if "Canceled" in datetime:
                continue

            # Try with and then without a clock-time component.
            formats = [
               "%A, %B %d, %Y %I:%M %p",
               "%A, %B %d, %Y"
            ]
            date_time = None

            for fmt in formats:
                try:
                    date_time = dt.datetime.strptime(
                        datetime, fmt)
                except ValueError:
                    pass

            if date_time is None:
                continue

            # NOTE(review): `chamber` is passed where other scrapers pass
            # `session`, and it still holds the previous iteration's value
            # on later loop passes — looks unintended; confirm.
            event = Event(chamber,
                          date_time,
                          'committee:meeting',
                          ctty_name,
                          location=metainf['Room:'] or "State House"
                         )
            event.add_source(url)

            # Derive the actual chamber from the committee name.
            chamber = "other"
            chambers = {
                "house": "lower",
                "joint": "joint",
                "senate": "upper",
            }
            for c in chambers:
                if c in ctty_name.lower():
                    chamber = chambers[c]

            event.add_participant('host', ctty_name, chamber=chamber)
            # add chair?

            self.save_event(event)
Exemplo n.º 36
0
    def scrape(self, chamber, session):
        """Scrape WA committee meetings from the legislature's web service.

        Queries the GetCommitteeMeetings endpoint over the session's date
        span and keeps non-cancelled meetings belonging to `chamber`.
        """
        start_date = "%s-01-10T00:00:00" % session[0:4]
        end_date = "%d-01-10T00:00:00" % (int(session[5:10]) + 1)

        # BUG FIX: a stray "x" between the adjacent string literals made
        # this assignment a syntax error.
        url = ("http://wslwebservices.leg.wa.gov/CommitteeMeetingService"
               ".asmx/GetCommitteeMeetings?beginDate=%s"
               "&endDate=%s" % (start_date, end_date))

        expected_agency = {'upper': 'Senate', 'lower': 'House'}[chamber]

        with self.urlopen(url) as page:
            page = lxml.etree.fromstring(page)

            for meeting in page.xpath(
                "//wa:CommitteeMeeting", namespaces=self._ns):

                # Skip meetings flagged as cancelled.
                cancelled = meeting.xpath(
                    "string(wa:Cancelled)", namespaces=self._ns).strip()
                if cancelled.lower() == "true":
                    continue

                # Agency ("Senate"/"House") selects the chamber.
                agency = meeting.xpath(
                    "string(wa:Agency)",
                    namespaces=self._ns).strip()

                if agency != expected_agency:
                    continue

                dt = meeting.xpath("string(wa:Date)", namespaces=self._ns)
                dt = datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S")

                room = meeting.xpath("string(wa:Room)", namespaces=self._ns)
                building = meeting.xpath(
                    "string(wa:Building)", namespaces=self._ns)
                location = "%s, %s" % (room, building)

                comm = meeting.xpath(
                    "string(wa:Committees/wa:Committee[1]/wa:Name)",
                    namespaces=self._ns)

                desc = "Committee Meeting\n%s" % comm

                guid = meeting.xpath(
                    "string(wa:AgendaId)", namespaces=self._ns)

                event = Event(session, dt, 'committee:meeting',
                              desc, location=location, _guid=guid)

                # Every committee listed on the meeting participates.
                for comm_part in meeting.xpath(
                    "wa:Committees/wa:Committee", namespaces=self._ns):
                    name = comm_part.xpath("string(wa:Name)",
                                           namespaces=self._ns)
                    agency = comm_part.xpath("string(wa:Agency)",
                                             namespaces=self._ns)
                    name = "%s %s Committee" % (agency, name)

                    event.add_participant('committee', name)

                self.save_event(event)
Exemplo n.º 37
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly schedule into committee events."""
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each agenda-PDF icon's parent link is one scheduled meeting.
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                # Sometimes we have a dead link; this is only on dead
                # entries, so skip them.
                continue

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub("\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.
            if not when_and_where:
                continue

            # Split into time ("when") and room ("where") parts.
            info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                            when_and_where).groupdict()
            when_and_where = info['when']
            location = info['where']

            # We can only scrape the current year's schedule.
            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            event = Event(session,
                          when,
                          'committee:meeting',
                          'Committee Meeting: %s' % committee,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 38
0
    def parse_row(self, row, session, chamber):
        """Parse one calendar table row into an Event (or None).

        Rows may omit the date or location cell, in which case the value
        carries over from the previous row via the self.last_* attributes.
        """
        for cell in row.xpath("./td[@class='dateCell']"):
            # Remember month/day for rows that have no date of their own.
            self.last_month = cell.xpath(
                "./span[@class='calendarMonth']")[0].text_content()
            self.last_day = cell.xpath(
                "./span[@class='calendarDay']")[0].text_content()

        time_cells = row.xpath("./td[@class='timeCell']")
        if not time_cells:
            return  # Nada.
        time_text = time_cells[0].text.strip()

        when = dt.datetime.strptime(
            "%s %s %s %s" % (self.last_month, self.last_day,
                             self.year, time_text),
            "%b %d %Y %I:%M %p")

        css_class = {
            "event": "eventCell",
            "status": "statusCell",
            "location": "locationCell",
            "transcript": "transcriptCell",
            "video": "videoCell"
        }
        metainf = {}
        for key in css_class:
            found = row.xpath("./td[@class='" + css_class[key] + "']")
            if found:
                metainf[key] = found[0]

        # An empty location cell inherits the previous row's location.
        if metainf['location'].xpath("./*") == []:
            metainf['location'] = self.last_location
        else:
            self.last_location = metainf['location']

        if "Session" in metainf['event'].text_content().strip():
            return  # Nada.

        loc_url = metainf['location'].xpath(".//a")[0].attrib['href']
        event = Event(session,
                      when,
                      'committee:meeting',
                      metainf['event'].text_content().strip(),
                      chamber=chamber,
                      location=metainf['location'].text_content().strip(),
                      location_url=loc_url)
        event.add_participant("host", metainf['event'].text_content().strip(),
                              'committee', chamber=chamber)
        self.add_agenda(event, metainf['event']
                        .xpath(".//a")[0].attrib['href'])
        return event
Exemplo n.º 39
0
    def scrape(self, session, chambers):
        """Scrape HI hearing notices; each table row is one hearing."""
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for row in table.xpath(".//tr")[1:]:
            tds = row.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            # Exactly one description span is expected per row.
            if len(descr) != 1:
                raise Exception
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          descr,
                          location=where)

            # "A/B" committee cells mean a joint hearing of several
            # committees.
            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    # Known committees resolve through the short-code map.
                    committee = self.short_ids[committee]
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host',
                                      committee['name'],
                                      'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name, notice_href, mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type=bill['type'])

            self.save_event(event)
Exemplo n.º 40
0
    def parse_row(self, row, session, chamber):
        """Turn a calendar row into an Event; returns None for non-events.

        Missing date/location cells inherit the previous row's values,
        tracked in self.last_month / self.last_day / self.last_location.
        """
        for date_cell in row.xpath("./td[@class='dateCell']"):
            month_span = date_cell.xpath("./span[@class='calendarMonth']")[0]
            day_span = date_cell.xpath("./span[@class='calendarDay']")[0]
            self.last_month = month_span.text_content()
            self.last_day = day_span.text_content()

        time_cell = row.xpath("./td[@class='timeCell']")
        if not time_cell:
            return  # no time cell -> not an event row
        time_str = time_cell[0].text.strip()

        stamp = "%s %s %s %s" % (self.last_month, self.last_day,
                                 self.year, time_str)
        when = dt.datetime.strptime(stamp, "%b %d %Y %I:%M %p")

        class_for = {
            "event": "eventCell",
            "status": "statusCell",
            "location": "locationCell",
            "transcript": "transcriptCell",
            "video": "videoCell"
        }
        metainf = {}
        for label in class_for:
            hits = row.xpath("./td[@class='" + class_for[label] + "']")
            if hits == []:
                continue
            metainf[label] = hits[0]

        if metainf['location'].xpath("./*") == []:
            # Empty cell: reuse the location from the previous row.
            metainf['location'] = self.last_location
        else:
            self.last_location = metainf['location']

        title = metainf['event'].text_content().strip()
        if "Session" in title:
            return  # floor session, not a committee meeting

        loc_links = metainf['location'].xpath(".//a")
        event = Event(session,
                      when,
                      'committee:meeting',
                      title,
                      chamber=chamber,
                      location=metainf['location'].text_content().strip(),
                      location_url=loc_links[0].attrib['href'])
        event.add_participant("host", title, 'committee', chamber=chamber)
        self.add_agenda(event, metainf['event'].xpath(".//a")[0].attrib['href'])
        return event
Exemplo n.º 41
0
    def scrape(self, chamber, session):
        """Scrape the weekly events calendar into committee meetings."""
        chmbr = cal_chamber_text[chamber]
        tables = self.url_xpath(cal_weekly_events,
                                "//table[@class='date-table']")
        for table in tables:
            # The date heading immediately precedes each day's table.
            date = table.xpath("../.")[0].getprevious().text_content()
            trs = table.xpath("./tr")
            for tr in trs:
                order = ["time", "chamber", "type", "agenda", "location",
                         "video"]

                tds = tr.xpath("./td")
                metainf = {}

                if not tds:
                    continue

                # Map each cell to its column name, positionally.
                for el in range(0, len(order)):
                    metainf[order[el]] = tds[el]

                # NOTE(review): this skips rows whose chamber text EQUALS
                # the requested chamber, which keeps the *other* chamber's
                # events — looks inverted (expected `!=`); confirm intent.
                if metainf['chamber'].text_content() == chmbr:
                    self.log("Skipping event based on chamber.")
                    continue

                time = metainf['time'].text_content()
                datetime_string = "%s %s" % (date, time)
                location = metainf['location'].text_content()
                description = metainf['type'].text_content()

                dtfmt = "%A, %B %d, %Y %I:%M %p"
                if time == 'Cancelled':
                    self.log("Skipping cancelled event.")
                    continue
                else:
                    # Some rows carry "Immediately follows ..." instead of
                    # a clock time; drop the tail and parse the date only.
                    if ' Immediately follows' in datetime_string:
                        datetime_string, _ = datetime_string.split(
                            'Immediately follows')
                        datetime_string = datetime_string.strip()
                        dtfmt = "%A, %B %d, %Y"

                    when = dt.datetime.strptime(datetime_string, dtfmt)
                event = Event(session, when, 'committee:meeting',
                              description, location=location)
                event.add_participant(
                    "host", description, 'committee', chamber=chamber)
                event.add_source(cal_weekly_events)

                agenda = metainf['agenda'].xpath(".//a")
                if len(agenda) > 0:
                    agenda = agenda
                    for doc in agenda:
                        if not doc.text_content():
                            continue
                        agenda_url = doc.attrib['href']
                        self.add_agenda(
                            agenda_url, doc.text_content(), event)
                self.save_event(event)
Exemplo n.º 42
0
    def scrape(self, chamber, session):
        """Scrape Iowa committee hearings within +/- 10 days of today."""
        if chamber == "other":
            return

        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        chamber_abbrev = "S" if chamber == "upper" else "H"

        url = (
            "http://www.legis.iowa.gov/Schedules/meetingsList"
            "Chamber.aspx?chamber=%s&bDate=%02d/%02d/"
            "%d&eDate=%02d/%02d/%d"
            % (
                chamber_abbrev,
                start_date.month,
                start_date.day,
                start_date.year,
                end_date.month,
                end_date.day,
                end_date.year,
            )
        )

        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        for link in page.xpath("//a[contains(@id, 'linkCommittee')]"):
            comm = link.text.strip()
            desc = comm + " Committee Hearing"
            location = link.xpath("string(../../td[3])")

            when = link.xpath("string(../../td[1])").strip()
            # Skip hearings without a concrete clock time.
            if when == "Cancelled" or "Upon" in when:
                continue
            if "To Be Determined" in when:
                continue

            # Keep everything up to and including the AM/PM marker.
            if "AM" in when:
                when = when.split("AM")[0] + " AM"
            else:
                when = when.split("PM")[0] + " PM"

            # Strip noise words that sometimes precede the time.
            for key in ["Reception"]:
                when = when.replace(key, "")

            when = re.sub("\s+", " ", when).strip()
            when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session, when, "committee:meeting", desc, location)
            event.add_source(url)
            event.add_participant("host", comm, "committee", chamber=chamber)
            self.save_event(event)
Exemplo n.º 43
0
    def scrape(self, chamber, session):
        """Scrape Alaska committee hearings for the full calendar year.

        Both chambers come back in one request (Chamb=B); each hearing's
        chamber is recovered from its "(H)"/"(S)" prefix.
        """
        if session != '27':
            raise NoDataForPeriod(session)

        if chamber == 'other':
            return

        # Resolve this session's term years from the scraper metadata.
        year, year2 = None, None
        for term in self.metadata['terms']:
            if term['sessions'][0] == session:
                year = str(term['start_year'])
                year2 = str(term['end_year'])
                break

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year[2:]

        url = ("http://www.legis.state.ak.us/basis/"
               "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
               "Comty=&Root=&Sel=1&Button=Display" % (
                   session, date1, date2))

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
            for font in page.xpath(path):
                match = re.match(r'^\((H|S)\)(.+)$', font.text)

                chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
                comm = match.group(2).strip().title()

                # Time and location live in the following table row.
                next_row = font.xpath("../../following-sibling::tr[1]")[0]

                when = next_row.xpath("string(td[1]/font)").strip()
                when = datetime.datetime.strptime(when + " " + year,
                                                  "%b %d  %A %I:%M %p %Y")
                when = self._tz.localize(when)

                where = next_row.xpath("string(td[2]/font)").strip()

                description = "Committee Meeting\n"
                description += comm

                event = Event(session, when, 'committee:meeting',
                              description, location=where)

                # BUG FIX: the agenda link was previously assigned to
                # `event` *before* the Event was created, raising NameError
                # (and a py2 `print` debug statement was left in). Attach
                # the link after construction instead.
                links = font.xpath(
                    "../../td/font/a[contains(@href, 'get_documents')]")
                if links:
                    event['link'] = links[0].attrib['href']

                event.add_source(url)
                self.save_event(event)
Exemplo n.º 44
0
    def scrape_agenda(self, url, session):
        """Scrape one RI agenda page: meeting time/place plus its bills."""
        page = self.lxmlize(url)

        # Pull the date/time info out of the "time_place" table rows.
        metainf = {}
        date_time = page.xpath("//table[@class='time_place']")[0]
        for line in date_time.xpath("./tr"):
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()

        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']

        # "Rise of the House"-style entries carry no clock time.
        fmt = "%A, %B %d, %Y"
        if time in all_day:
            datetime = date
        else:
            fmt += " %I:%M %p"
            datetime = "%s %s" % ( date, time )
        datetime = dt.datetime.strptime(datetime, fmt)

        event = Event(session, datetime, 'committee:meeting',
                      'Meeting Notice', location=where)
        event.add_source(url)

        # aight. Let's get us some bills!
        for bill in page.xpath("//b/a"):
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(), bill_ft, type="full-text",
                               mimetype="application/pdf")
            bill_id = "".join(x.text_content() for x in bill.xpath('../../*'))

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().getparent().getnext().getnext(
                ).text_content()

            # Normalize the bill id using the module-level `replace` map.
            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')

        # The session label tells us which chamber is hosting.
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {
            "house" : "lower",
            "joint" : "joint",
            "senate" : "upper"
        }
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, chamber=chamber)

        self.save_event(event)
Exemplo n.º 45
0
def test_event():
    """Event should record added documents and related bills verbatim."""
    when = datetime.datetime(2012, 1, 1)
    event = Event('S1', when, 'meeting',
                  'event description', 'event location')
    doc_url = 'http://example.com/event/agenda.txt'
    event.add_document('agenda', doc_url)
    event.add_related_bill('HB 1', relation='considered')
    # Documents default to type 'other' when none is given.
    expected_docs = [{'name': 'agenda', 'url': doc_url, 'type': 'other'}]
    assert_equal(event['documents'], expected_docs)
    expected_bills = [{'bill_id': 'HB 1', 'relation': 'considered'}]
    assert_equal(event['related_bills'], expected_bills)
Exemplo n.º 46
0
    def scrape(self, session, chambers):
        """Scrape the agenda calendar page and save each meeting as an Event.

        Relies on the module-level ``calurl`` constant for the calendar URL
        and on ``self.scrape_participants`` for committee membership.
        """
        page = self.lxmlize(calurl)
        # Skip the header row of the agenda table.
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        # Renamed loop variable from `event` to `row`: the original rebound
        # `event` to an Event object mid-loop, shadowing the iterator value.
        for row in events:
            comit_url = row.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                # Raise with a message instead of a bare `raise Exception`.
                raise Exception(
                    "Expected exactly one committee link, got %d" %
                    len(comit_url))

            comit_url = comit_url[0]
            who = self.scrape_participants(session, comit_url.attrib['href'])

            tds = row.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            # The page omits the year; assume the current one.
            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          name,
                          location=where,
                          link=notice)

            event.add_source(calurl)
            event.add_participant('host',
                                  cttie,
                                  'committee',
                                  chamber=cttie_chamber)
            event.add_document("notice", notice, mimetype='application/pdf')

            for thing in who:
                event.add_participant(thing['title'],
                                      thing['name'],
                                      'legislator',
                                      chamber=cttie_chamber)

            self.save_event(event)
Exemplo n.º 47
0
    def scrape(self, session, chambers):
        """Scrape the KY legislative calendar into committee-meeting Events.

        Rows with unparseable times, chamber "convenes" notices, or an
        undeterminable chamber are skipped.
        """
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            # The date lives on a span two levels above the meeting div.
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            if time == "Noon":
                time = "12:00pm"

            if ':' not in time:
                self.warning('skipping event with invalid time: %s', time)
                continue
            when = "%s %s" % (date, time)
            # The site is inconsistent about a space before am/pm; try both.
            try:
                when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p")
            except ValueError:
                when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M %p")

            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            if desc.lower().strip() in ["house convenes","senate convenes"]:
                continue

            event = Event(session, when, 'committee:meeting',
                          desc, location=location)
            event.add_source(url)

            # desc is actually the ctty name.
            if "house" in desc.lower():
                chamber = "lower"
            elif "senate" in desc.lower():
                chamber = "upper"
            elif "joint" in desc.lower():
                chamber = "joint"
            else:
                self.logger.warning("Event %s chamber is unknown, skipping" % desc)
                continue

            event.add_participant('host', desc, 'committee', chamber = chamber)

            self.save_event(event)
Exemplo n.º 48
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly schedule page into committee Events.

        Agenda PDF links anchor each meeting; rows with no schedule text
        are skipped.
        """
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../../td[1])").strip()

            when_and_where = link.xpath("string(../../../td[2])").strip()

            # The last comma-separated token is the room/location.
            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 49
0
    def scrape_event_page(self, session, chamber, url, datetime):
        """Scrape a single committee hearing-notice page into an Event.

        :param session: legislative session identifier
        :param chamber: chamber of the hearing ('upper'/'lower'/'joint')
        :param url: URL of the hearing notice page
        :param datetime: when the meeting occurs (parsed by the caller; the
            name shadows the stdlib module inside this method — kept for
            interface compatibility)
        """
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainf = {}
        plaintext = ""
        # Each "KEY: value" paragraph becomes a metadata entry; all paragraph
        # text is also accumulated for bill-id scanning below.
        for p in info:
            # Raw strings avoid invalid-escape warnings for \s.
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainf[key.strip()] = val.strip()
        ctty = metainf['COMMITTEE']
        where = metainf['PLACE']
        # The chair's name is sometimes appended to the PLACE line.
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainf['PLACE'] = where.strip()
            metainf['CHAIR'] = chair.strip()

        # dict.get replaces the original None-default-then-overwrite dance.
        chair = metainf.get('CHAIR')

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        # Bill ids look like "HB 123", "SR 4", "JM 2": chamber, type, number.
        bills = re.findall(r"(S|J|H)(B|M|R) (\d+)", plaintext)

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      ctty,
                      chamber=chamber,
                      location=where,
                      agenda=plaintext)
        event.add_source(url)
        event.add_participant('host', ctty, 'committee', chamber=chamber)
        if chair is not None:
            event.add_participant('chair',
                                  chair,
                                  'legislator',
                                  chamber=chamber)

        # Distinct names so the loop no longer shadows the `chamber`
        # parameter or the `type` builtin.
        for bill_chamber, bill_type, number in bills:
            bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
            event.add_related_bill(bill_id,
                                   type='consideration',
                                   description='Bill up for discussion')

        self.save_event(event)
Exemplo n.º 50
0
    def scrape_committee_agendas(self, chamber, session):
        """
        Scrape upper or lower committee agendas from the AZ legislature site
        and save each scheduled (non-cancelled) meeting as an Event.
        """
        # could use &ShowAll=ON doesn't seem to work though
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
                                          self._chamber_short[chamber]
        with self.urlopen(url) as agendas:
            root = html.fromstring(agendas)
            # The two chambers nest the schedule table at different depths.
            if chamber == 'upper':
                event_table = root.xpath(
                    '//table[@id="body"]/tr/td/table[2]/tr'
                    '/td/table/tr/td/table')[0]
            else:
                event_table = root.xpath(
                    '//table[@id="body"]/tr/td/table[2]/tr'
                    '/td/table/tr/td/table/tr/td/table')[0]
            # First two rows are headers.
            for row in event_table.xpath('tr')[2:]:
                # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
                # HTML Document, PDF Document for house
                # Agenda Date, Committee, Revised, Cancelled, Time, Room,
                # HTML Document, PDF Document for senate
                text = [x.text_content().strip() for x in row.xpath('td')]
                when, committee = text[0:2]
                # Column offsets differ per chamber (see layout above).
                if chamber == 'upper':
                    time, room = text[4:6]
                    link = row[6].xpath('string(a/@href)')
                else:
                    time, room = text[5:7]
                    link = row[7].xpath('string(a/@href)')
                if 'NOT MEETING' in time or 'CANCELLED' in time:
                    continue
                # Extract "H:MM A"/"H:MM P"; an "M" is appended below.
                time = re.match('(\d+:\d+ (A|P))', time)
                if time:
                    when = "%s %sM" % (text[0], time.group(0))
                    when = datetime.datetime.strptime(when,
                                                      '%m/%d/%Y %I:%M %p')
                else:
                    # No parseable time: fall back to a date-only event.
                    when = text[0]
                    when = datetime.datetime.strptime(when, '%m/%d/%Y')

                when = self._tz.localize(when)

                title = "Committee Meeting:\n%s %s %s\n" % (
                    self._chamber_long[chamber], committee, room)
                (description, member_list, meeting_type,
                 other) = self.parse_agenda(chamber, link)
                event = Event(session,
                              when,
                              'committee:meeting',
                              title,
                              location=room,
                              link=link,
                              details=description)
                event.add_participant('committee', committee)
                event['participants'].extend(member_list)
                event.add_source(url)
                event.add_source(link)
                self.save_event(event)
Exemplo n.º 51
0
    def scrape(self, session, chambers):
        """Scrape Alaska BASIS meeting search results into Events.

        Placeholder ("No Meeting") entries and whole-chamber floor sessions
        are skipped; documents on the detail page are attached.
        """
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath(
                '//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            event = Event(
                    session=session,
                    # col02 holds the date/time string for the meeting.
                    when=self._TZ.localize(datetime.datetime.strptime(
                            info.xpath('span[@class="col02"]/text()')[0],
                            self._DATETIME_FORMAT
                            )),
                    type='committee:meeting',
                    # Collapse the schedule block's text into one line.
                    description=" ".join(x.strip() for x
                            in doc.xpath('//div[@class="schedule"]//text()')
                            if x.strip()),
                    location=doc.xpath(
                            '//div[@class="heading-container"]/span/text()')
                            [0].title()
                    )

            # col01 is the (committee) name of the meeting host.
            event.add_participant(
                    type='host',
                    participant=info.xpath(
                            'span[@class="col01"]/text()')[0].title(),
                    participant_type='committee'
                    )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                        name=document.xpath('text()')[0],
                        url=document.xpath('@href')[0]
                        )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            self.save_event(event)
Exemplo n.º 52
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly meeting schedule into Events.

        Rows lacking a committee name, lacking a PDF agenda link, or marked
        "Not Meeting" are ignored.
        """
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                # Bug fix: indexing an empty xpath() result raises
                # IndexError, not KeyError, so the previous `except KeyError`
                # could never skip dead links as intended.
                continue

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            # Pad with None so partially-filled rows still unpack cleanly.
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(',') if s] + [None] * 3)[:3]
            self.logger.debug(location)

            # The page omits the year; assume the current one.
            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                                              '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee_name,
                                  'committee',
                                  chamber='lower')
            event.add_document('Agenda',
                               guid,
                               type='agenda',
                               mimetype='application/pdf')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 53
0
    def scrape_upper_events(self, session):
        """Scrape the FL Senate daily-calendar RSS feed into Events."""
        url = "https://www.flsenate.gov/Tracker/RSS/DailyCalendar"
        feed = feedparser.parse(self.get(url).text)

        for entry in feed['entries']:
            # NOTE: the feed breaks the RSS standard by making the pubdate
            # the actual event's date, not the RSS item publish date.
            when = datetime.datetime(*entry['published_parsed'][:6])

            parts = entry['summary'].split(' - ')
            desc = parts[0]
            location = parts[1]

            event = Event(session, when, 'committee:meeting', desc, location)
            event.add_source(entry['link'])

            self.save_event(event)
Exemplo n.º 54
0
    def process_event(self, data):
        """Translate one upstream event record into an Event and persist it."""
        # Events are always attached to the most recent term.
        session = self.metadata['terms'][-1]['name']

        event = Event(
            session=session,
            when=parse_datetime(data['start_time']),
            type='committee:meeting',
            description=data['description'],
            timezone=data['timezone'],
            location=data['location']['name'],
            end=data['end_time'],
        )

        # TODO: participants, documents, related_bills

        for src in data['sources']:
            event.add_source(src['url'])

        self.save_event(event)
Exemplo n.º 55
0
    def scrape(self, chamber, session):
        """Scrape CA committee hearings from the legislative DB for one
        chamber, grouping hearings that share a (location, date) into a
        single Event with all related bills attached.
        """
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            # Location strings are prefixed 'Asm'/'Sen' by chamber.
            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        # .items() works on both Python 2 and 3; the original .iteritems()
        # is Python-2-only and crashes under Python 3.
        for ((location, date), hearings) in grouped_hearings.items():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = 'Committee Meeting: ' + committee_name
            event = Event(session,
                          date,
                          'committee:meeting',
                          desc,
                          location=committee_name)
            for bill_id in bills:
                if 'B' in bill_id:
                    type_ = 'bill'
                else:
                    type_ = 'resolution'
                event.add_related_bill(bill_id,
                                       type=type_,
                                       description='consideration')

            event.add_participant('host',
                                  committee_name + ' Committee',
                                  'committee',
                                  chamber=chamber)
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            self.save_event(event)
Exemplo n.º 56
0
    def scrape_lower_events(self, session):
        """Scrape NY Assembly hearing listings into Events.

        Senate standing-committee rows are skipped; they are handled by the
        upper-chamber scraper.
        """
        url = "http://assembly.state.ny.us/leg/?sh=hear"

        # The page omits the year, so assume hearings are this year.
        year = datetime.date.today().year

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for td in page.xpath("//td[@bgcolor='#99CCCC']"):
                desc = td.xpath("string(following-sibling::td/strong)")
                if 'Senate Standing Committee' in desc:
                    # We should pick these up from the upper scraper
                    continue

                notes = td.xpath("string(../following-sibling::tr[1]/td[2])")
                notes = re.sub(r'\*\*Click here to view hearing notice\*\*',
                               '', notes).strip()

                location = td.xpath(
                    "string(../following-sibling::tr[2]/td[2])")

                date = ' '.join(td.text.split()[0:2]).strip()

                # The time cell may hold a range like "10:00 AM - 12:00 PM".
                time = td.xpath("../following-sibling::tr[3]/td[2]")[0]
                split_time = time.text.split('-')

                when = "%s %d %s" % (date, year, split_time[0].strip())
                when = _parse_date(when.replace('.', ''))

                # End time is only present when the cell held a range.
                end = None
                if len(split_time) > 1:
                    end = "%s %d %s" % (date, year, split_time[1].strip())
                    end = _parse_date(end.replace('.', ''))

                event = Event(session,
                              when,
                              'committee:meeting',
                              desc,
                              location,
                              end=end,
                              notes=notes)
                event.add_source(url)
                self.save_event(event)
Exemplo n.º 57
0
    def scrape_page(self, url, session, chamber):
        """Scrape a single IL hearing-notice page into an Event.

        :param url: hearing notice page URL
        :param session: legislative session identifier
        :param chamber: chamber hosting the hearing
        """
        page = self.lxmlize(url)

        ctty_name = page.xpath(
            "//span[@class='heading']")[0].text_content().replace(
                "Hearing Notice For ", "")
        tables = page.xpath("//table[@cellpadding='3']")
        # The first table holds key/value metadata rows.
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        # Renamed from `datetime` to avoid shadowing the module name; a
        # space is shimmed in before AM/PM so strptime can parse it.
        when = re.sub(r"\s+", " ", metainf['Scheduled Date:'])
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            when = when.replace(r, repl[r])
        when = dt.datetime.strptime(when, "%b %d, %Y %I:%M %p")

        event = Event(session,
                      when,
                      'committee:meeting',
                      description,
                      location=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            # Bug fix: str.replace returns a new string; the original code
            # discarded the result, so the prefix was never stripped.
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant('host', ctty_name, 'committee', chamber=chamber)

        # The second table lists bills scheduled for the hearing.
        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            event.add_related_bill(bill_id,
                                   description=description,
                                   type='consideration')

        self.save_event(event)
Exemplo n.º 58
0
    def scrape(self, session, chambers):
        """Scrape the UT Granicus agenda RSS feed into Events, following
        each item's link for committee and document details.
        """
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            (title, when) = info.xpath('title/text()')[0].split(" - ")
            # Only keep items whose date string ends with this session's year.
            if not when.endswith(session[ :len("20XX")]):
                continue

            event = Event(
                    session=session,
                    when=datetime.datetime.strptime(when, '%b %d, %Y'),
                    type='committee:meeting',
                    description=title,
                    location='State Capitol'
                    )
            event.add_source(URL)

            # The item body contains a link to the full agenda page.
            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            doc = self.lxmlize(url)
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                        '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(
                        type='host',
                        participant=committee_name,
                        participant_type='committee'
                        )

            documents = doc.xpath('.//td')
            for document in documents:
                event.add_document(
                        name=document.xpath('text()')[0],
                        # The PDF URL is embedded in the cell's onclick handler.
                        url=re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0]).group(1),
                        mimetype='application/pdf'
                        )

            self.save_event(event)
Exemplo n.º 59
0
    def scrape(self, chamber, session):
        """Scrape PA committee meeting schedules for one chamber.

        The page groups meeting rows under date-header cells; rows following
        each header are consumed until the next header appears.
        """
        if chamber == 'upper':
            url = "http://www.legis.state.pa.us/WU01/LI/CO/SM/COSM.HTM"
        else:
            url = "http://www.legis.state.pa.us/WU01/LI/CO/HM/COHM.HTM"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for date_td in page.xpath("//td[@valign='middle']"):
                date = date_td.text.strip()

                # Validates the date format; the parsed value is unused.
                datetime.datetime.strptime(date_td.text.strip(),
                                           "%A, %B %d, %Y").date()

                # Walk the following rows until the next date-header row.
                next_tr = date_td.getparent().getnext()
                while next_tr is not None:
                    if next_tr.xpath("td[@valign='middle']"):
                        break

                    time = next_tr.xpath("string(td[1])").strip()
                    dt = "%s %s" % (date, time)

                    try:
                        dt = datetime.datetime.strptime(
                            dt, "%A, %B %d, %Y %I:%M %p")
                        dt = self._tz.localize(dt)
                    except ValueError:
                        # Unparseable time ends this date's listing.
                        break

                    desc = next_tr.xpath("string(td[2])").strip()
                    desc = re.sub(r'\s+', ' ', desc)

                    location = next_tr.xpath("string(td[3])").strip()
                    location = re.sub(r'\s+', ' ', location)

                    event = Event(session, dt, 'committee:meeting', desc,
                                  location)
                    event.add_source(url)
                    self.save_event(event)

                    next_tr = next_tr.getnext()
Exemplo n.º 60
0
    def scrape_committee_events(self, session, code, name):
        """Scrape the JSON calendar feed for one CT committee into Events.

        ``code`` is the committee code used by the feed; ``name`` is unused
        here but kept for interface compatibility with callers.
        """
        events_url = ('http://www.cga.ct.gov/basin/fullcalendar/'
                      'commevents.php?comm_code={}'.format(code))
        calendar = json.loads(self.urlopen(events_url))

        fmt = '%Y-%m-%dT%H:%M:%SZ'
        for info in calendar:
            where = "{0} {1}".format(info['building'].strip(),
                                     info['location'].strip())
            event = Event(session=session,
                          when=datetime.datetime.strptime(info['start'], fmt),
                          end=datetime.datetime.strptime(info['end'], fmt),
                          type='committee:meeting',
                          description=info['title'],
                          location=where)
            event.add_source(events_url)

            self.save_event(event)