Example #1
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

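            # Each agenda-PDF icon sits inside an <a>; "/.." selects that
            # parent link so we can read its href and the surrounding row.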
            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                guid = link.attrib['href']

                committee = link.xpath("string(../../../td[1])").strip()

                when_and_where = link.xpath("string(../../../td[2])").strip()

                location = when_and_where.split(',')[-1]
                when = parse_datetime(when_and_where, session)

                description = 'Committee Meeting: %s' % committee

                event = Event(session, when, 'committee:meeting',
                              description, location=location)
                event.add_participant('committee', committee)
                event['link'] = guid

                self.save_event(event)
Example #2
    def scrape_page(self, url, session, chamber):
        try:
            page = self.lxmlize(url)
        except lxml.etree.XMLSyntaxError:
            self.warning("Ugh. Invalid HTML")
            return  # Ugh, invalid HTML.
        agendas = page.xpath("//td[@class='numberspace']")

        spans = page.xpath("//center/span")
        ctty = None
        date = None
        time = None
        if len(spans) >= 4:
            ctty = spans[0].text_content().strip()
            date = spans[2].text_content().strip()
            time = spans[3].text_content().strip()

        bills = []
        for agenda in agendas:
            number = agenda.text_content()
            string = agenda.getnext().text_content().strip()
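            # Bill ids look like "S.B. 12" or "H.R. 345"; findall yields
            # (chamber, type, number) tuples, reassembled below.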
            re_bills = re.findall(r"(S|H)\.?(B|R|M)\. (\d+)", string)
            for bill in re_bills:
                bill_id = "%s%s %s" % bill
                bills.append({"name": bill_id, "desc": string})

        if ctty is None or date is None or time is None:
            return

        datetime = "%s %s" % (date.strip(), time.strip())
        datetime = re.sub("AGENDA", "", datetime).strip()
        datetime = [x.strip() for x in datetime.split("\r\n")]

        if "" in datetime:
            datetime.remove("")

        if len(datetime) == 1:
            datetime.append("state house")

        where = datetime[1]
        translate = {"a.m.": "AM", "p.m.": "PM"}
        for t in translate:
            datetime[0] = datetime[0].replace(t, translate[t])
        datetime = dt.datetime.strptime(datetime[0], "%A, %B %d, %Y %I:%M %p")

        chamber = "other"
        cLow = ctty.lower()
        if "seante" in cLow:
            chamber = "upper"
        elif "house" in cLow:
            chamber = "lower"
        elif "joint" in cLow:
            chamber = "joint"

        event = Event(session, datetime, "committee:meeting", ctty, location=where)
        event.add_source(url)
        event.add_participant("host", ctty, "committee", chamber=chamber)
        for bill in bills:
            event.add_related_bill(bill["name"], description=bill["desc"], type="consideration")
        self.save_event(event)
Example #3
    def scrape(self, chamber, session):
        if chamber == "other":
            return

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.urlopen(url)
        page = csv.reader(StringIO.StringIO(page.bytes), delimiter="|")

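        # Pipe-delimited feed: row[2] holds the timestamp, row[3] a time
        # code, row[5] the room, and row[7] the meeting description.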
        for row in page:
            desc = row[7].strip()

            match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
            if match:
                comm_chamber = {"HOUSE": "lower", "SENATE": "upper"}[match.group(2)]
                if comm_chamber != chamber:
                    continue

                comm = match.group(1).strip()
                comm = re.sub(r"\s+", " ", comm)
                location = row[5].strip() or "Unknown"
                when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")

                event = Event(session, when, "committee:meeting", "%s MEETING" % comm, location=location)
                event.add_source(url)

                event.add_participant("committee", comm, chamber=chamber)

                time = row[3].strip()
                if time in TIMECODES:
                    event["notes"] = TIMECODES[time]

                self.save_event(event)
Example #4
    def scrape(self, chamber, session):
        bills_discussed = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            bills_discussed[(location, date)].append(hearing.bill_id)

        for ((location, date), bills) in bills_discussed.iteritems():
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bills
            ]

            desc = 'Committee Meeting\n%s\nDiscussed: %s' % (location,
                                                             ', '.join(bills))

            event = Event(session,
                          date,
                          'committee:meeting',
                          desc,
                          location=location)
            event.add_participant('committee', location)

            self.save_event(event)
Example #5
    def scrape_page(self, url, chamber, session):
        page = self.lxmlize(url)
        info_blocks = {
            "canceled": "//div[@class='cancelled']",
            "committee": "//div[@class='titlemeetingtype']",
            "chamber": "//div[@class='titlehouse']",
            "datetime": "//div[@class='datetimelocation']"
        }
        metainf = {}
        for block in info_blocks:
            info = page.xpath(info_blocks[block])
            if info == []:
                continue
            metainf[block] = {
                "obj": info[0],
                "txt": info[0].text_content()
            }

        if 'committee' not in metainf:
            return

        if 'canceled' in metainf:
            return

        obj = metainf['datetime']['obj']
        dates = obj.xpath("./*")
        date_time = obj.text.strip()
        for date in dates:
            if date.tail is not None:
                date_time += " %s" % (date.tail.strip())
        # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol
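        # Split once on the AM/PM marker: the left half (plus the marker)
        # is the time, the remainder is the meeting place.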
        splits = ['AM', 'PM']
        date_times = None
        for split in splits:
            if split in date_time:
                date_times = [x.strip() for x in date_time.split(split, 1)]
                date_times[0] += " " + split

        # Bail out when no AM/PM marker was found; we cannot parse a time.
        if date_times is None:
            return

        time = date_times[0]
        place = date_times[1]

        committee = metainf['committee']['txt']
        chamber = metainf['chamber']['txt']

        try:
            chamber = {
                "Senate": "upper",
                "Assembly": "lower",
                "Joint": "joint"
            }[chamber]
        except KeyError:
            chamber = 'other'

        # Wednesday, May 23, 2012 10:00 AM
        datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
        event = Event(session, datetime, 'committee:meeting',
                      committee, location=place)
        event.add_participant('host', committee, 'committee', chamber=chamber)
        event.add_source(url)
        self.save_event(event)
Example #6
    def scrape(self, chamber, session):
        bills_discussed = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            bills_discussed[(location, date)].append(hearing.bill_id)

        for ((location, date), bills) in bills_discussed.iteritems():
            bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                     for bill in bills]

            desc = 'Committee Meeting\n%s\nDiscussed: %s' % (location,
                                                             ', '.join(bills))

            event = Event(session, date, 'committee:meeting', desc,
                          location=location)
            event.add_participant('committee', location, 'committee')

            self.save_event(event)
Example #7
    def scrape(self, chamber, session):
        if chamber == 'other':
            return

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.urlopen(url)
        page = csv.reader(StringIO.StringIO(page.bytes), delimiter='|')

        for row in page:
            desc = row[7].strip()

            match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
            if match:
                comm_chamber = {'HOUSE': 'lower',
                                'SENATE': 'upper'}[match.group(2)]
                if comm_chamber != chamber:
                    continue

                comm = match.group(1).strip()
                comm = re.sub(r'\s+', ' ', comm)
                location = row[5].strip() or 'Unknown'
                when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')

                event = Event(session, when, 'committee:meeting',
                              "%s MEETING" % comm,
                              location=location)
                event.add_source(url)

                event.add_participant('host', comm, chamber=chamber)

                time = row[3].strip()
                if time in TIMECODES:
                    event['notes'] = TIMECODES[time]

                self.save_event(event)
Example #8
    def scrape_page(self, url, chamber, session):
        page = self.lxmlize(url)
        info_blocks = {
            "canceled": "//div[@class='cancelled']",
            "committee": "//div[@class='titlemeetingtype']",
            "chamber": "//div[@class='titlehouse']",
            "datetime": "//div[@class='datetimelocation']"
        }
        metainf = {}
        for block in info_blocks:
            info = page.xpath(info_blocks[block])
            if info == []:
                continue
            metainf[block] = {"obj": info[0], "txt": info[0].text_content()}

        if 'committee' not in metainf:
            return

        if 'canceled' in metainf:
            return

        obj = metainf['datetime']['obj']
        dates = obj.xpath("./*")
        date_time = obj.text.strip()
        for date in dates:
            if date.tail is not None:
                date_time += " %s" % (date.tail.strip())
        # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol
        splits = ['AM', 'PM']
        date_times = None
        for split in splits:
            if split in date_time:
                date_times = [x.strip() for x in date_time.split(split, 1)]
                date_times[0] += " " + split

        # Bail out when no AM/PM marker was found; we cannot parse a time.
        if date_times is None:
            return

        time = date_times[0]
        place = date_times[1]

        committee = metainf['committee']['txt']
        chamber = metainf['chamber']['txt']

        try:
            chamber = {
                "Senate": "upper",
                "Assembly": "lower",
                "Joint": "joint"
            }[chamber]
        except KeyError:
            chamber = 'other'

        # Wednesday, May 23, 2012 10:00 AM
        datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
        event = Event(session,
                      datetime,
                      'committee:meeting',
                      committee,
                      location=place)
        event.add_participant('host', committee, chamber=chamber)
        event.add_source(url)
        self.save_event(event)
Example #9
    def scrape(self, chamber, session):
        seen = set()
        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            if (location, date) in seen:
                continue
            seen.add((location, date))

            desc = 'Committee Meeting\n%s' % location

            event = Event(session, date, 'committee:meeting', desc,
                          location=location)
            event.add_participant('committee', location)

            self.save_event(event)
Example #10
    def scrape(self, chamber, session):
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            if ':' not in time:
                self.warning('skipping event with invalid time: %s', time)
                continue
            when = "%s %s" % (date, time)
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p")
            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            event = Event(session,
                          when,
                          'committee:meeting',
                          desc,
                          location=location)
            event.add_source(url)

            # desc is actually the ctty name.
            event.add_participant('host', desc, 'committee', chamber=chamber)

            self.save_event(event)
Example #11
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                guid = link.attrib['href']

                committee = link.xpath("string(../../../td[1])").strip()

                when_and_where = link.xpath("string(../../../td[2])").strip()

                location = when_and_where.split(',')[-1]
                when = parse_datetime(when_and_where, session)

                description = 'Committee Meeting: %s' % committee

                event = Event(session,
                              when,
                              'committee:meeting',
                              description,
                              location=location)
                event.add_participant('committee', committee)
                event['link'] = guid

                self.save_event(event)
Example #12
    def scrape(self, chamber, session):
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
                date = div.xpath("string(../../span[1])").strip()

                try:
                    time, location = div.xpath("string(span[1])").split(',')
                except ValueError:
                    # No meetings
                    continue

                when = "%s %s" % (date, time)
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M%p")
                when = self._tz.localize(when)

                desc = div.xpath("string(span[2])").strip()
                agenda = div.xpath("string(span[3])").strip()
                # XXX: Process `agenda' for related bills.
                event = Event(session, when, 'committee:meeting',
                              desc, location=location)
                event.add_source(url)

                # desc is actually the ctty name.
                event.add_participant('host', desc, 'committee',
                                      chamber=chamber)

                self.save_event(event)
Example #13
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                try:
                    guid = link.attrib['href']
                except KeyError:
                    # Sometimes we have a dead link; this only happens on
                    # dead entries.
                    continue

                committee = link.xpath("string(../../../td[1])").strip()

                when_and_where = link.xpath("string(../../../td[2])").strip()

                location = when_and_where.split(',')[-1]
                when = parse_datetime(when_and_where, session)

                description = 'Committee Meeting: %s' % committee

                event = Event(session, when, 'committee:meeting',
                              description, location=location)
                event.add_source(url)
                event.add_participant('host', committee, chamber='lower')
                event.add_document("Agenda", guid, type='agenda',
                                   mimetype="application/pdf")
                event['link'] = guid

                self.save_event(event)
Example #14
def actions_to_events(state):
    for bill in db.bills.find({'state': state}):
        print "Converting %s actions to events" % bill['_id']

        count = 1
        for action in bill['actions']:
            guid = "%s:action:%06d" % (bill['_id'], count)
            count += 1

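            # The deterministic GUID lets us update an existing event in
            # place instead of inserting a duplicate on re-scrape.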
            event = db.events.find_one({'state': state,
                                        '_guid': guid})

            description = "%s: %s" % (bill['bill_id'], action['action'])
            data = Event(bill['session'], action['date'],
                         'bill:action', description, location=action['actor'],
                         action_type=action['type'])
            data.add_participant('actor', action['actor'])
            data['_guid'] = guid
            data['state'] = state

            if not event:
                data['created_at'] = datetime.datetime.utcnow()
                data['updated_at'] = data['created_at']
                _insert_with_id(data)
            else:
                update(event, data, db.events)
Example #15
    def scrape_committee_events(self, session, code, name):
        url = "http://www.cga.ct.gov/asp/menu/" "CGACommCal.asp?comm_code=%s" % code
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            cal_table = page.xpath("//table[contains(@summary, 'Calendar')]")[0]

            date_str = None
            for row in cal_table.xpath("tr[2]//tr"):
                col1 = row.xpath("string(td[1])").strip()
                col2 = row.xpath("string(td[2])").strip()

                if not col1:
                    if col2 == "No Meetings Scheduled":
                        return
                    # If col1 is empty then this is a date header
                    date_str = col2
                else:
                    # Otherwise, this is a committee event row
                    when = date_str + " " + col1
                    when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M %p")
                    when = self._tz.localize(when)

                    location = row.xpath("string(td[3])").strip()
                    guid = row.xpath("td/a")[0].attrib["href"]

                    event = Event(session, when, "committee:meeting", col2, location, _guid=guid)
                    event.add_source(url)
                    event.add_participant("committee", name, chamber="joint")

                    self.save_event(event)
Example #16
    def scrape_event(self, chamber, session, obj):
        meeting = obj['data']['meeting']
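        # meetingDateTime is epoch milliseconds; convert to seconds before
        # building a datetime from it.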
        date = int(meeting['meetingDateTime'])
        date = dt.datetime.fromtimestamp(date / 1000)
        if str(date.year) not in session:
            return
        description = 'Committee Meeting: ' + meeting['committeeName']
        event = Event(session,
                      date,
                      'committee:meeting',
                      description=description,
                      location=meeting['location'] or 'No location given.')
        event.add_source(obj['url'])
        event.add_participant('chair',
                              meeting['committeeChair'],
                              'legislator',
                              chamber='upper')
        event.add_participant('host',
                              meeting['committeeName'],
                              'committee',
                              chamber='upper')

        rgx = r'([a-z]+)(\d+)'
        for bill in meeting['bills']:
            raw_id = bill['senateBillNo']
            bill_id = ' '.join(re.search(rgx, raw_id, re.I).groups())
            event.add_related_bill(bill_id,
                                   type='bill',
                                   description=bill['summary']
                                   or 'No description given.')
        return event
Example #17
    def scrape(self, session, chambers):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
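            # Skip items not dated in the session's year (the first four
            # characters of the session string).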
            if not when.endswith(session[:len("20XX")]):
                continue

            event = Event(session=session,
                          when=datetime.datetime.strptime(when, '%b %d, %Y'),
                          type='committee:meeting',
                          description=title,
                          location='State Capitol')
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            doc = self.lxmlize(url)
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                if committee_name.lower().startswith("senate"):
                    chamber = "upper"
                elif committee_name.lower().startswith("house"):
                    chamber = "lower"
                else:
                    chamber = "joint"
                event.add_participant(type='host',
                                      participant=committee_name,
                                      participant_type='committee',
                                      chamber=chamber)

            documents = doc.xpath('.//td')
            for document in documents:
                url = re.search(r'(http://.*?pdf)',
                                document.xpath('@onclick')[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(name=document.xpath('text()')[0],
                                   url=url,
                                   mimetype='application/pdf')
                bills = document.xpath('@onclick')
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        event.add_related_bill(
                            bill_name,
                            type='consideration',
                            description='Bill up for discussion')

            self.save_event(event)
Example #18
    def scrape_committee_agendas(self, chamber, session):
        """
        Scrape upper or lower committee agendas
        """
        # could use &ShowAll=ON doesn't seem to work though
        url = ('http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' %
               self._chamber_short[chamber])
        html_ = self.get(url).text
        doc = html.fromstring(html_)
        if chamber == 'upper':
            event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table')[0]
        else:
            event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table/tr/td/table')[0]
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
            # HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue
            time = re.match(r'(\d+:\d+ (A|P))', time)
            if time:
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
            else:
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')

            title = "Committee Meeting:\n%s %s %s\n" % (
                                              self._chamber_long[chamber],
                                              committee, room)
            agenda_info = self.parse_agenda(chamber, link)

            description = agenda_info['description']
            member_list = agenda_info['member_list']
            related_bills = agenda_info['related_bills']

            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link, details=description,
                          related_bills=related_bills)
            event.add_participant('host', committee, 'committee',
                                  chamber=chamber)

            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)
            self.save_event(event)
Example #19
    def scrape(self, chamber, session):
        start_date = "%s-01-10T00:00:00" % session[0:4]
        end_date = "%d-01-10T00:00:00" % (int(session[5:10]) + 1)

        url = ("http://wslwebservices.leg.wa.gov/CommitteeMeetingService"
x               ".asmx/GetCommitteeMeetings?beginDate=%s"
               "&endDate=%s" % (start_date, end_date))

        expected_agency = {'upper': 'Senate', 'lower': 'House'}[chamber]

        with self.urlopen(url) as page:
            page = lxml.etree.fromstring(page)

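            # The response is namespaced XML, so every XPath below passes
            # the wa: prefix mapping via self._ns.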
            for meeting in page.xpath(
                "//wa:CommitteeMeeting", namespaces=self._ns):

                cancelled = meeting.xpath(
                    "string(wa:Cancelled)", namespaces=self._ns).strip()
                if cancelled.lower() == "true":
                    continue

                agency = meeting.xpath(
                    "string(wa:Agency)",
                    namespaces=self._ns).strip()

                if agency != expected_agency:
                    continue

                dt = meeting.xpath("string(wa:Date)", namespaces=self._ns)
                dt = datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S")

                room = meeting.xpath("string(wa:Room)", namespaces=self._ns)
                building = meeting.xpath(
                    "string(wa:Building)", namespaces=self._ns)
                location = "%s, %s" % (room, building)

                comm = meeting.xpath(
                    "string(wa:Committees/wa:Committee[1]/wa:Name)",
                    namespaces=self._ns)

                desc = "Committee Meeting\n%s" % comm

                guid = meeting.xpath(
                    "string(wa:AgendaId)", namespaces=self._ns)

                event = Event(session, dt, 'committee:meeting',
                              desc, location=location, _guid=guid)

                for comm_part in meeting.xpath(
                    "wa:Committees/wa:Committee", namespaces=self._ns):
                    name = comm_part.xpath("string(wa:Name)",
                                           namespaces=self._ns)
                    agency = comm_part.xpath("string(wa:Agency)",
                                             namespaces=self._ns)
                    name = "%s %s Committee" % (agency, name)

                    event.add_participant('committee', name)

                self.save_event(event)
Example #20
    def scrape_committee_agendas(self, chamber, session):
        """
        Scrape upper or lower committee agendas
        """
        # could use &ShowAll=ON doesn't seem to work though
        url = ('http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' %
               self._chamber_short[chamber])
        with self.urlopen(url) as agendas:
            root = html.fromstring(agendas)
            if chamber == 'upper':
                event_table = root.xpath(
                    '//table[@id="body"]/tr/td/table[2]/tr'
                    '/td/table/tr/td/table')[0]
            else:
                event_table = root.xpath(
                    '//table[@id="body"]/tr/td/table[2]/tr'
                    '/td/table/tr/td/table/tr/td/table')[0]
            for row in event_table.xpath('tr')[2:]:
                # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
                # HTML Document, PDF Document for house
                # Agenda Date, Committee, Revised, Cancelled, Time, Room,
                # HTML Document, PDF Document for senate
                text = [x.text_content().strip() for x in row.xpath('td')]
                when, committee = text[0:2]
                if chamber == 'upper':
                    time, room = text[4:6]
                    link = row[6].xpath('string(a/@href)')
                else:
                    time, room = text[5:7]
                    link = row[7].xpath('string(a/@href)')
                if 'NOT MEETING' in time or 'CANCELLED' in time:
                    continue
                time = re.match(r'(\d+:\d+ (A|P))', time)
                if time:
                    when = "%s %sM" % (text[0], time.group(0))
                    when = datetime.datetime.strptime(when,
                                                      '%m/%d/%Y %I:%M %p')
                else:
                    when = text[0]
                    when = datetime.datetime.strptime(when, '%m/%d/%Y')

                when = self._tz.localize(when)

                title = "Committee Meeting:\n%s %s %s\n" % (
                    self._chamber_long[chamber], committee, room)
                (description, member_list, meeting_type,
                 other) = self.parse_agenda(chamber, link)
                event = Event(session,
                              when,
                              'committee:meeting',
                              title,
                              location=room,
                              link=link,
                              details=description)
                event.add_participant('committee', committee)
                event['participants'].extend(member_list)
                event.add_source(url)
                event.add_source(link)
                self.save_event(event)
Example #21
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                # Sometimes we have a dead link; this only happens on
                # dead entries.
                continue

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                            when_and_where).groupdict()

            when_and_where = info['when']
            location = info['where']

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape the current year.
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example #22
    def scrape(self, chamber, session):
        if chamber != "other":
            return None
        page = self.lxmlize(url)
        meetings = page.xpath("//div[@class='Comm_item']")
        for meeting in meetings:
            metas = meeting.xpath(".//b")
            ctty = meeting.xpath(".//a")[0]
            ctty_name = ctty.text_content()
            info = metas[1:]
            datetime = metas[0]
            metainf = {}
            for meta in info:
                header = meta.text_content().strip()
                val = meta.tail
                metainf[header] = val or ""
            datetime = datetime.text_content().strip()
            # Tuesday, June 05, 2012 9:00 AM
            if "Canceled" in datetime:
                continue

            formats = [
                "%A, %B %d, %Y %I:%M %p",
                "%A, %B %d, %Y"
            ]
            date_time = None

            for fmt in formats:
                try:
                    date_time = dt.datetime.strptime(
                        datetime, fmt)
                except ValueError:
                    pass

            if date_time is None:
                continue

            event = Event(session,
                          date_time,
                          'committee:meeting',
                          ctty_name,
                          location=metainf['Room:'] or "State House")
            event.add_source(url)

            chamber = "other"
            chambers = {
                "house": "lower",
                "joint": "joint",
                "senate": "upper",
            }
            for c in chambers:
                if c in ctty_name.lower():
                    chamber = chambers[c]

            event.add_participant('host', ctty_name, chamber=chamber)
            # add chair?

            self.save_event(event)
Example #23
    def scrape_agenda(self, url, session):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
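        # All-day notices carry no clock time; only extend the format with
        # a time when one is present.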
        fmt = "%A, %B %d, %Y"
        if time in all_day:
            datetime = date
        else:
            fmt += " %I:%M %p"
            datetime = "%s %s" % ( date, time )
        datetime = dt.datetime.strptime(datetime, fmt)

        event = Event(session, datetime, 'committee:meeting',
                      'Meeting Notice', location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(), bill_ft, type="full-text",
                               mimetype="application/pdf")
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = (bill.getparent().getparent().getparent()
                     .getnext().getnext().text_content())

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {
            "house": "lower",
            "joint": "joint",
            "senate": "upper"
        }
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, chamber=chamber)

        self.save_event(event)
Example #24
    def parse_row(self, row, session, chamber):
        dates = row.xpath("./td[@class='dateCell']")
        for date in dates:
            # alright, so we *may* not get a date, in which case the date
            # is the same as the last event.
            cal_date = date.xpath("./span[@class='calendarMonth']")[0]
            cal_day = date.xpath("./span[@class='calendarDay']")[0]
            self.last_month = cal_date.text_content()
            self.last_day = cal_day.text_content()
        time = row.xpath("./td[@class='timeCell']")
        if not time:
            return  # Nada.
        time = time[0]
        time = time.text.strip()
        dt_string = "%s %s %s %s" % (
            self.last_month,
            self.last_day,
            self.year,
            time
        )
        fmt = "%b %d %Y %I:%M %p"
        when = dt.datetime.strptime(dt_string, fmt)
        cells = {
            "event": "eventCell",
            "status": "statusCell",
            "location": "locationCell",
            "transcript": "transcriptCell",
            "video": "videoCell"
        }
        metainf = {}
        for thing in cells:
            mi = row.xpath("./td[@class='" + cells[thing] + "']")
            if mi == []:
                continue
            metainf[thing] = mi[0]

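        # An empty location cell means "same room as the previous row", so
        # carry the last location forward.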
        if metainf['location'].xpath("./*") == []:
            metainf['location'] = self.last_location
        else:
            self.last_location = metainf['location']

        if "Session" in metainf['event'].text_content().strip():
            return  # Nada.

        loc_url = metainf['location'].xpath(".//a")
        loc_url = loc_url[0].attrib['href']
        event = Event(session,
                      when,
                      'committee:meeting',
                      metainf['event'].text_content().strip(),
                      chamber=chamber,
                      location=metainf['location'].text_content().strip(),
                      location_url=loc_url)
        event.add_participant("host", metainf['event'].text_content().strip(),
                              'committee', chamber=chamber)
        self.add_agenda(event, metainf['event']
                        .xpath(".//a")[0].attrib['href'])
        return event
Example #25
    def scrape(self, session, chambers):
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            if time == "Noon":
                time = "12:00pm"

            if ':' not in time:
                self.warning('skipping event with invalid time: %s', time)
                continue
            when = "%s %s" % (date, time)
            try:
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M%p")
            except ValueError:
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M %p")

            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            if desc.lower().strip() in ["house convenes", "senate convenes"]:
                continue

            event = Event(session,
                          when,
                          'committee:meeting',
                          desc,
                          location=location)
            event.add_source(url)

            # desc is actually the ctty name.
            if "house" in desc.lower():
                chamber = "lower"
            elif "senate" in desc.lower():
                chamber = "upper"
            elif "joint" in desc.lower():
                chamber = "joint"
            else:
                self.logger.warning("Event %s chamber is unknown, skipping" %
                                    desc)
                continue

            event.add_participant('host', desc, 'committee', chamber=chamber)

            self.save_event(event)
Example #26
    def scrape(self, session, chambers):
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("expected exactly one description span")
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          descr,
                          location=where)

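            # Joint hearings list several committees separated by "/"; add
            # a host participant for each one.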
            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [
                    committee,
                ]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids[committee]
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host',
                                      committee['name'],
                                      'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name, notice_href, mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type=bill['type'])

            self.save_event(event)
Example #27
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                # Sometimes we have a dead link; this only happens on
                # dead entries.
                continue

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
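            # Pad with Nones so date, time, location always unpack, even
            # when the cell omits trailing parts.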
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(',') if s] + [None] * 3)[:3]
            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                                              '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee_name,
                                  'committee',
                                  chamber='lower')
            event.add_document('Agenda',
                               guid,
                               type='agenda',
                               mimetype='application/pdf')
            event['link'] = guid

            self.save_event(event)
Example #28
    def scrape(self, chamber, session):
        if chamber == "other":
            return

        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        if chamber == "upper":
            chamber_abbrev = "S"
        else:
            chamber_abbrev = "H"

        url = (
            "http://www.legis.iowa.gov/Schedules/meetingsList"
            "Chamber.aspx?chamber=%s&bDate=%02d/%02d/"
            "%d&eDate=%02d/%02d/%d"
            % (
                chamber_abbrev,
                start_date.month,
                start_date.day,
                start_date.year,
                end_date.month,
                end_date.day,
                end_date.year,
            )
        )

        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        for link in page.xpath("//a[contains(@id, 'linkCommittee')]"):
            comm = link.text.strip()
            desc = comm + " Committee Hearing"
            location = link.xpath("string(../../td[3])")

            when = link.xpath("string(../../td[1])").strip()
            if when == "Cancelled" or "Upon" in when:
                continue
            if "To Be Determined" in when:
                continue

            if "AM" in when:
                when = when.split("AM")[0] + " AM"
            else:
                when = when.split("PM")[0] + " PM"

            junk = ["Reception"]
            for key in junk:
                when = when.replace(key, "")

            when = re.sub(r"\s+", " ", when).strip()
            when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session, when, "committee:meeting", desc, location)
            event.add_source(url)
            event.add_participant("host", comm, "committee", chamber=chamber)
            self.save_event(event)
Example #29
    def scrape(self, chamber, session):
        chmbr = cal_chamber_text[chamber]
        tables = self.url_xpath(cal_weekly_events,
                                "//table[@class='date-table']")
        for table in tables:
            date = table.xpath("../.")[0].getprevious().text_content()
            trs = table.xpath("./tr")
            for tr in trs:
                order = ["time", "chamber", "type", "agenda", "location",
                         "video"]

                tds = tr.xpath("./td")
                metainf = {}

                if not tds:
                    continue

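                # Columns come in a fixed order; map each <td> to its name.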
                for el in range(0, len(order)):
                    metainf[order[el]] = tds[el]

                if metainf['chamber'].text_content() != chmbr:
                    self.log("Skipping event based on chamber.")
                    continue

                time = metainf['time'].text_content()
                datetime_string = "%s %s" % (date, time)
                location = metainf['location'].text_content()
                description = metainf['type'].text_content()

                dtfmt = "%A, %B %d, %Y %I:%M %p"
                if time == 'Cancelled':
                    self.log("Skipping cancelled event.")
                    continue
                else:
                    if ' Immediately follows' in datetime_string:
                        datetime_string, _ = datetime_string.split(
                            'Immediately follows')
                        datetime_string = datetime_string.strip()
                        dtfmt = "%A, %B %d, %Y"

                    when = dt.datetime.strptime(datetime_string, dtfmt)
                event = Event(session, when, 'committee:meeting',
                              description, location=location)
                event.add_participant(
                    "host", description, 'committee', chamber=chamber)
                event.add_source(cal_weekly_events)

                agenda = metainf['agenda'].xpath(".//a")
                if len(agenda) > 0:
                    for doc in agenda:
                        if not doc.text_content():
                            continue
                        agenda_url = doc.attrib['href']
                        self.add_agenda(
                            agenda_url, doc.text_content(), event)
                self.save_event(event)
Example #30
    def parse_page(self, url, session):
        page = self.lxmlize(url)
        tables = page.xpath("//table[@class='pubhrgtbl']")
        date = None
        ctty = None
        chamber = 'other'
        for table in tables:
            metainf = {}
            rows = table.xpath(".//tr")
            for row in rows:
                tds = row.xpath("./*")
                if len(tds) < 2:
                    continue
                key, value = tds
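                # A <th> row starts a new hearing block (date + committee);
                # <td> rows are key/value detail pairs.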
                if key.tag == 'th':
                    date = key.text_content()
                    date = re.sub(r"\s+", " ", date)
                    date = re.sub(r".*POSTPONED NEW DATE", "", date).strip()
                    ctty = value.xpath(".//strong")[0]
                    ctty = ctty.text_content()

                    chamber = 'other'
                    if "senate" in ctty.lower():
                        chamber = 'upper'
                    if "house" in ctty.lower():
                        chamber = 'lower'
                    if "joint" in ctty.lower():
                        chamber = 'joint'
                elif key.tag == 'td':
                    key = key.text_content().strip()
                    value = value.text_content().strip()
                    value = value.replace(u'\x96', '-')
                    value = re.sub(r"\s+", " ", value)
                    metainf[key] = value

            time = metainf['Time:']
            repl = {"A.M.": "AM", "P.M.": "PM"}
            for r in repl:
                time = time.replace(r, repl[r])

            time = re.sub("-.*", "", time)
            time = time.strip()

            year = dt.datetime.now().year

            date = "%s %s %s" % (date, year, time)
            datetime = dt.datetime.strptime(date, "%B %d %Y %I:%M %p")
            event = Event(session,
                          datetime,
                          'committee:meeting',
                          metainf['Public Hearing:'],
                          location=metainf['Place:'],
                          contact=metainf['Contact:'],
                          media_contact=metainf['Media Contact:'])
            event.add_source(url)
            event.add_participant('host', ctty, chamber=chamber)
            self.save_event(event)
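A note on the format string fixed above: the hearing dates on this page are month-name first (e.g. "June 5"), so the directive must be %d (day of month), not %m (month number). A minimal check with a hypothetical string:

    import datetime as dt

    sample = "June 5 2014 10:00 AM"  # hypothetical "date year time" value
    when = dt.datetime.strptime(sample, "%B %d %Y %I:%M %p")
    assert when == dt.datetime(2014, 6, 5, 10, 0)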
Example #33
    def scrape(self, chamber, session):
        if chamber == 'other':
            return

        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        if chamber == 'upper':
            chamber_abbrev = 'S'
        else:
            chamber_abbrev = 'H'

        url = ("http://www.legis.iowa.gov/Schedules/meetingsList"
               "Chamber.aspx?chamber=%s&bDate=%02d/%02d/"
               "%d&eDate=%02d/%02d/%d" %
               (chamber_abbrev, start_date.month, start_date.day,
                start_date.year, end_date.month, end_date.day, end_date.year))

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        for link in page.xpath("//a[contains(@id, 'linkCommittee')]"):
            comm = link.text.strip()
            desc = comm + " Committee Hearing"
            location = link.xpath("string(../../td[3])")

            when = link.xpath("string(../../td[1])").strip()
            if 'cancelled' in when.lower() or "upon" in when.lower():
                continue
            if "To Be Determined" in when:
                continue

            if 'AM' in when:
                when = when.split('AM')[0] + " AM"
            else:
                when = when.split('PM')[0] + " PM"

            junk = ['Reception']
            for key in junk:
                when = when.replace(key, '')

            when = re.sub(r"\s+", " ", when).strip()
            if "tbd" in when.lower():
                # OK. This is a partial date of some sort.
                when = datetime.datetime.strptime(when,
                                                  "%m/%d/%Y TIME - TBD %p")
            else:
                try:
                    when = datetime.datetime.strptime(when,
                                                      "%m/%d/%Y %I:%M %p")
                except ValueError:
                    when = datetime.datetime.strptime(when, "%m/%d/%Y %I %p")

            event = Event(session, when, 'committee:meeting', desc, location)
            event.add_source(url)
            event.add_participant('host', comm, 'committee', chamber=chamber)
            self.save_event(event)
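The nested strptime fallback above recurs throughout these scrapers; it can be factored into a small helper that tries each candidate format in turn. A sketch, with hypothetical inputs:

    import datetime

    def parse_first_match(text, formats):
        # Try each candidate format until one parses.
        for fmt in formats:
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                continue
        raise ValueError("no format matched %r" % text)

    parse_first_match("01/15/2014 1:30 PM", ["%m/%d/%Y %I:%M %p", "%m/%d/%Y %I %p"])
    parse_first_match("01/15/2014 1 PM", ["%m/%d/%Y %I:%M %p", "%m/%d/%Y %I %p"])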
Example #34
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [row for row in meeting_rows if row.xpath(
            './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
            './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
            './td[2]')[0].text_content()]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                    '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                # xpath() returns a list, so a dead link (an empty result)
                # raises IndexError here; skip those entries.
                continue

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = ([s.strip() for s in meeting_string.split(
                ',') if s] + [None] * 3)[:3]

            # Check for a time embedded in the date field (missing comma).
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(session, when, 'committee:meeting',
                description, location=location)
            event.add_source(url)
            event.add_participant('host', committee_name, 'committee',
                chamber='lower')
            event.add_document('Agenda', guid, type='agenda',
                mimetype='application/pdf')
            event['link'] = guid

            self.save_event(event)
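The `+ [None]*3` padding above guarantees the three-way unpack succeeds even when the location segment is missing. The idiom in isolation, with a hypothetical cell value:

    meeting_string = "Jun 5, 1:30 PM"  # no location segment
    date, time, location = ([s.strip() for s in meeting_string.split(
        ',') if s] + [None] * 3)[:3]
    assert (date, time, location) == ("Jun 5", "1:30 PM", None)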
Example #35
    def scrape(self, session, chambers):
        calendar_url = "http://dccouncil.us/calendar"
        data = self.get(calendar_url).text
        doc = lxml.html.fromstring(data)

        committee_regex = re.compile("(Committee .*?)will")

        event_list = doc.xpath("//div[@class='event-description-dev']")
        for event in event_list:
            place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
            when = " ".join([place_and_time[0].strip(),place_and_time[1].strip()])
            if len(place_and_time) > 2:
                location = place_and_time[2]
            else:
                location = "unknown"
            #when is now of the following format:
            #Wednesday, 2/25/2015 9:30am
            when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
            description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
            description_lines = description_content.xpath("./*")
            desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
            description = re.sub(r'\s+'," ", description_content.text_content()).strip()
            potential_bills = description_content.xpath(".//li")

            committee = committee_regex.search(desc_without_title)
            event_type = 'other'
            if committee is not None:
                committee = committee.group(1).strip()
                event_type = 'committee:meeting'

            e = Event(session, when, event_type, description, location)

            for b in potential_bills:
                bill = b.xpath("./a/text()")
                if len(bill) == 0:
                    # No bills attached to this event.
                    continue
                bill = bill[0]
                bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
                ses, num = bill.split("-")
                bill = ses.replace(" ", "") + "-" + num.zfill(4)
                if "PR" in bill or "CER" in bill:
                    e.add_related_bill(bill, type="resolution", description=bill_desc)
                else:
                    e.add_related_bill(bill, type="bill", description=bill_desc)

            e.add_source(calendar_url)

            if committee:
                e.add_participant("host",
                                  committee,
                                  'committee',
                                  chamber="upper")

            self.save_event(e)
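The committee regex compiled above is non-greedy up to the literal word "will", so it captures just the committee name from sentences like the hypothetical one below:

    import re

    committee_regex = re.compile("(Committee .*?)will")
    desc = "The Committee on Health will hold a public hearing on the budget."
    match = committee_regex.search(desc)
    assert match.group(1).strip() == "Committee on Health"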
Example #36
    def scrape(self, session, chambers):
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("Expected exactly one description span")
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session, when, 'committee:meeting', descr,
                          location=where)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee,]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get("committee",{"chamber":"unknown", "name":committee})

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host', committee['name'], 'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(
                    bill['bill_id'],
                    description=bill['descr'],
                    type=bill['type']
                )

            self.save_event(event)
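The lookup fixed above leans on dict.get with a default, which keeps the scraper from crashing on committee codes that are missing from the short-ID table. A sketch with a hypothetical table:

    short_ids = {"WAM": {"chamber": "upper", "name": "Ways and Means"}}
    committee = "JUD"  # code absent from the table
    info = short_ids.get(committee, {"chamber": "unknown", "name": committee})
    assert info == {"chamber": "unknown", "name": "JUD"}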
Example #37
    def scrape(self, chamber, session):
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.iteritems():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = 'Committee Meeting: ' + committee_name
            event = Event(session,
                          date,
                          'committee:meeting',
                          desc,
                          location=committee_name)
            for bill_id in bills:
                if 'B' in bill_id:
                    type_ = 'bill'
                else:
                    type_ = 'resolution'
                event.add_related_bill(bill_id,
                                       type=type_,
                                       description='consideration')

            event.add_participant('host',
                                  committee_name + ' Committee',
                                  'committee',
                                  chamber=chamber)
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            self.save_event(event)
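The grouping step above uses the standard defaultdict(list) idiom: hearings sharing a (location, date) key accumulate in one bucket and are later emitted as a single event. A minimal sketch with hypothetical keys:

    from collections import defaultdict

    grouped = defaultdict(list)
    for location, date, name in [("Room 4202", "06/05", "h1"),
                                 ("Room 4202", "06/05", "h2")]:
        grouped[(location, date)].append(name)
    assert grouped[("Room 4202", "06/05")] == ["h1", "h2"]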
Example #38
    def scrape(self, chamber, session):
        if chamber != "other":
            return None
        page = self.lxmlize(url)
        meetings = page.xpath("//div[@class='Comm_item']")
        for meeting in meetings:
            metas = meeting.xpath(".//b")
            ctty = meeting.xpath(".//a")[0]
            ctty_name = ctty.text_content()
            info = metas[1:]
            datetime = metas[0]
            metainf = {}
            for meta in info:
                header = meta.text_content().strip()
                val = meta.tail
                metainf[header] = val or ""
            datetime = datetime.text_content().strip()
            # Tuesday, June 05, 2012 9:00 AM
            if "Canceled" in datetime:
                continue

            formats = ["%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y"]
            date_time = None

            for fmt in formats:
                try:
                    date_time = dt.datetime.strptime(datetime, fmt)
                    break
                except ValueError:
                    pass

            if date_time is None:
                continue

            event = Event(session,
                          date_time,
                          'committee:meeting',
                          ctty_name,
                          location=metainf['Room:'] or "State House")
            event.add_source(url)

            chamber = "other"
            chambers = {
                "house": "lower",
                "joint": "joint",
                "senate": "upper",
            }
            for c in chambers:
                if c in ctty_name.lower():
                    chamber = chambers[c]

            event.add_participant('host', ctty_name, chamber=chamber)
            # add chair?

            self.save_event(event)
Example #39
    def scrape(self, chamber, session):
        cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

        print_format = "%m/%d/%Y"
        now = dt.datetime.now()

        start = now.strftime(print_format)
        end = (now+timedelta(days=30)).strftime(print_format)
        url = event_page % (cha, start, end)

        page = self.lxmlize(url)

        committees = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
        for comm in committees:
            comm_page = self.lxmlize(comm)
            meetings = comm_page.xpath("//li[contains(@class, 'partialagendaitems')]")
            for meeting in meetings:
                heading,content = meeting.xpath("./ul/li")
                who,when = heading.text.split(" - ")
                meeting_title = "Scheduled meeting of %s" % who.strip()
                where_lines = content.text_content().split("\r\n")
                where = "\r\n".join([l.strip() for l in where_lines[6:9]])

                when = dt.datetime.strptime(when.strip(), "%m/%d/%Y %I:%M:%S %p")

                kwargs = {
                    "location": (where or '').strip() or "unknown"
                }

                event = Event(session, when, 'committee:meeting',
                              meeting_title, **kwargs)

                event.add_participant(
                    "host",
                    who.strip(),
                    'committee',
                    chamber=chamber
                )
                event.add_source(url)

                # Only scraping public hearing bills for now.
                bills = meeting.xpath(".//div[text() = 'Public Hearing']/following-sibling::li[contains(@class, 'visible-lg')]")
                for bill in bills:
                    bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                    event.add_related_bill(
                        bill_id.strip(),
                        description=descr.strip(),
                        type="consideration"
                    )

                self.save_event(event)
Example #40
    def scrape_page(self, url, session, chamber):
        page = self.lxmlize(url)

        ctty_name = page.xpath(
            "//span[@class='heading']")[0].text_content().replace(
                "Hearing Notice For ", "")
        tables = page.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      description,
                      location=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            # str.replace returns a new string; assign it back.
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant('host', ctty_name, 'committee', chamber=chamber)

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            event.add_related_bill(bill_id,
                                   description=description,
                                   type='consideration')

        self.save_event(event)
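One pitfall fixed above: str.replace returns a new string rather than mutating in place, so the result must be assigned back. Compare:

    name = "Hearing Notice For Judiciary"
    name.replace("Hearing Notice For", "")  # return value discarded; name unchanged
    assert name == "Hearing Notice For Judiciary"
    name = name.replace("Hearing Notice For", "").strip()
    assert name == "Judiciary"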
Example #41
    def scrape(self, session, chambers):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception("Expected exactly one committee link")

            comit_url = comit_url[0]
            who = self.scrape_participants(session, comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          name,
                          location=where,
                          link=notice)

            event.add_source(calurl)
            event.add_participant('host',
                                  cttie,
                                  'committee',
                                  chamber=cttie_chamber)
            event.add_document("notice", notice, mimetype='application/pdf')

            for thing in who:
                event.add_participant(thing['title'],
                                      thing['name'],
                                      'legislator',
                                      chamber=cttie_chamber)

            self.save_event(event)
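Like several scrapers in this listing, the one above splices the current year into a year-less date string before parsing. The pattern in isolation, with hypothetical cell values:

    import datetime as dt

    date, time = "Wed Jun 05", "1:30 PM"  # table cells carry no year
    when = ", ".join([date, str(dt.datetime.now().year), time])
    when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")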
Example #42
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            match = re.match(
                r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)",
                when_and_where
            )
            if match is None:
                continue  # Unexpected row format; skip rather than crash.
            info = match.groupdict()

            when_and_where = info['when']
            location = info['where']

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, 'committee',
                                  chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill, description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
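Because re.match returns None on failure, calling .groupdict() on the result directly (as the original did) raises AttributeError on unexpected rows; the guard added above skips them instead. The same guard in isolation, with a hypothetical row:

    import re

    when_and_where = "9:00 AM Room F-100"
    match = re.match(r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)", when_and_where)
    if match is not None:
        info = match.groupdict()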
Example #43
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../../td[1])").strip()

            when_and_where = link.xpath("string(../../../td[2])").strip()

            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example #44
    def scrape(self, session, chambers):
        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            if time == "Noon":
                time = "12:00pm"

            if ':' not in time:
                self.warning('skipping event with invalid time: %s', time)
                continue
            when = "%s %s" % (date, time)
            try:
                when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p")
            except ValueError:
                when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M %p")

            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            if desc.lower().strip() in ["house convenes", "senate convenes"]:
                continue

            event = Event(session, when, 'committee:meeting',
                          desc, location=location)
            event.add_source(url)

            # desc is actually the ctty name.
            if "house" in desc.lower():
                chamber = "lower"
            elif "senate" in desc.lower():
                chamber = "upper"
            elif "joint" in desc.lower():
                chamber = "joint"
            else:
                self.logger.warning("Event %s chamber is unknown, skipping" % desc)
                continue

            event.add_participant('host', desc, 'committee', chamber=chamber)

            self.save_event(event)
Example #45
    def scrape_committee_agendas(self, chamber, session):
        """
        Scrape upper or lower committee agendas
        """
        # could use &ShowAll=ON; doesn't seem to work though
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
            self._chamber_short[chamber]
        with self.urlopen(url) as agendas:
            root = html.fromstring(agendas)
            if chamber == 'upper':
                event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                         '/td/table/tr/td/table')[0]
            else:
                event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                         '/td/table/tr/td/table/tr/td/table')[0]
            for row in event_table.xpath('tr')[2:]:
                # Agenda Date, Committee, Revised, Addendum, Cancelled, Time,
                # Room, HTML Document, PDF Document for house;
                # Agenda Date, Committee, Revised, Cancelled, Time, Room,
                # HTML Document, PDF Document for senate.
                text = [x.text_content().strip() for x in row.xpath('td')]
                when, committee = text[0:2]
                if chamber == 'upper':
                    time, room = text[4:6]
                    link = row[6].xpath('string(a/@href)')
                else:
                    time, room = text[5:7]
                    link = row[7].xpath('string(a/@href)')
                if 'NOT MEETING' in time or 'CANCELLED' in time:
                    continue
                time = re.match(r'(\d+:\d+ (A|P))', time)
                if time:
                    when = "%s %sM" % (text[0], time.group(0))
                    when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
                else:
                    when = text[0]
                    when = datetime.datetime.strptime(when, '%m/%d/%Y')

                when = self._tz.localize(when)

                title = "Committee Meeting:\n%s %s %s\n" % (
                    self._chamber_long[chamber], committee, room)
                (description, member_list,
                 meeting_type, other) = self.parse_agenda(chamber, link)
                event = Event(session, when, 'committee:meeting', title,
                              location=room, link=link, details=description)
                event.add_participant('committee', committee)
                event['participants'].extend(member_list)
                event.add_source(url)
                event.add_source(link)
                self.save_event(event)
Example #46
    def scrape(self, chamber, session):
        year_abr = ((int(session) - 209) * 2) + 2000
        self.initialize_committees(year_abr)
        url, db = self.get_dbf(year_abr, "AGENDAS")
        records = [ x.asDict() for x in db ]
        for record in records:
            if record['STATUS'] != "Scheduled":
                continue
            description = record['COMMENTS']
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id" : "%s %s" % ( bill[0], bill[2] ),
                    "descr": description
                })

            date_time = "%s %s" % (
                record['DATE'],
                record['TIME']
            )
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
            hr_name = self._committees[record['COMMHOUSE']]

            event = Event(
                session,
                date_time,
                'committee:meeting',
                "Meeting of the %s" % ( hr_name ),
                location=record['LOCATION'] or "Statehouse",
            )
            for bill in related_bills:
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type='consideration')
            try:
                chamber = {
                    "a" : "lower",
                    "s" : "upper",
                    "j" : "joint"
                }[record['COMMHOUSE'][0].lower()]
            except KeyError:
                chamber = "joint"

            event.add_participant("host",
                                  hr_name,
                                  'committee',
                                  committee_code=record['COMMHOUSE'],
                                  chamber=chamber)
            event.add_source(url)  # the AGENDAS dbf URL fetched above
            self.save_event(event)
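The findall pattern above captures each bill reference in three groups so the optional hyphen can be dropped when the ID is rebuilt. With a hypothetical comment string:

    import re

    comments = "Considering A-1234 and S5678."
    bill_ids = ["%s %s" % (bill[0], bill[2])
                for bill in re.findall(r"(A|S)(-)?(\d{4})", comments)]
    assert bill_ids == ["A 1234", "S 5678"]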
Example #47
    def scrape_meeting_notice(self, chamber, session, url):
        page = self.lxmlize(url)
        bits = page.xpath("//td[@width='96%']/table/tr")
        metainf = {}
        for bit in bits:
            info = bit.xpath(".//td")
            key = info[0].text_content().strip()
            val = info[1].text_content().strip()
            if key[-1:] == ":":
                key = key[:-1]
            metainf[key] = val
        date_time_lbl = "Date/Time"
        # 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"
        metainf[date_time_lbl] = dt.datetime.strptime(metainf[date_time_lbl],
                                                      fmt)
        event = Event(session,
                      metainf[date_time_lbl],
                      "committee:meeting",
                      "Committee Meeting",
                      chamber=chambers[metainf['Chamber']],
                      location=metainf['Room'],
                      chairman=metainf['Chairman'])
        event.add_participant("host", metainf['Committee'],
                              chamber=chambers[metainf['Chamber']])
        event.add_source(url)

        agenda = page.xpath("//td[@width='96%']//font[@face='Arial']")
        agenda = [ a.text_content().strip() for a in agenda ]
        if "" in agenda:
            agenda.remove("")
        for item in agenda:
            string = item.split()
            string = string[:2]
            fChar = string[0][0]
            watch = [ "H", "S" ]
            if fChar in watch:
                try:
                    bNo = int(string[1])
                except ValueError:
                    continue
                except IndexError:
                    continue
                bill_id = "%s %s" % ( string[0], string[1] )
                event.add_related_bill(
                    bill_id,
                    description=item,
                    type="consideration"
                )

        self.save_event(event)
Example #48
    def scrape_meeting_notice(self, chamber, session, url):
        page = self.lxmlize(url)
        bits = page.xpath("//td[@width='96%']/table/tr")
        metainf = {}
        for bit in bits:
            info = bit.xpath(".//td")
            key = info[0].text_content().strip()
            val = info[1].text_content().strip()
            if key[-1:] == ":":
                key = key[:-1]
            metainf[key] = val
        date_time_lbl = "Date/Time"
        # 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"
        metainf[date_time_lbl] = dt.datetime.strptime(metainf[date_time_lbl],
                                                      fmt)
        event = Event(session,
                      metainf[date_time_lbl],
                      "committee:meeting",
                      "Committee Meeting",
                      chamber=chambers[metainf['Chamber']],
                      location=metainf['Room'],
                      chairman=metainf['Chairman'])
        event.add_participant("host",
                              metainf['Committee'],
                              'committee',
                              chamber=chambers[metainf['Chamber']])
        event.add_source(url)

        agenda = page.xpath("//td[@width='96%']//font[@face='Arial']")
        agenda = [a.text_content().strip() for a in agenda]
        if "" in agenda:
            agenda.remove("")
        for item in agenda:
            string = item.split()
            string = string[:2]
            fChar = string[0][0]
            watch = ["H", "S"]
            if fChar in watch:
                try:
                    bNo = int(string[1])
                except ValueError:
                    continue
                except IndexError:
                    continue
                bill_id = "%s %s" % (string[0], string[1])
                event.add_related_bill(bill_id,
                                       description=item,
                                       type="consideration")

        self.save_event(event)
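The agenda scan above keys off the first character of each line and only builds a bill ID when the second token is numeric, so headings like "Housekeeping items" fall through the int() check. A sketch with hypothetical agenda lines:

    agenda = ["HB 101 An act concerning...", "Housekeeping items", "S 45 Budget hearing"]
    for item in agenda:
        tokens = item.split()[:2]
        if tokens and tokens[0][0] in ("H", "S"):
            try:
                int(tokens[1])
            except (ValueError, IndexError):
                continue
            bill_id = "%s %s" % (tokens[0], tokens[1])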
Example #50
    def scrape(self, chamber, session):
        cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

        print_format = "%m/%d/%Y"
        now = dt.datetime.now()

        start = now.strftime(print_format)
        end = (now + timedelta(days=30)).strftime(print_format)
        url = event_page % (cha, start, end)

        page = self.lxmlize(url)

        committees = page.xpath(
            "//a[contains(@href,'Agendas?CommitteeId')]/@href")
        for comm in committees:
            comm_page = self.lxmlize(comm)
            meetings = comm_page.xpath(
                "//li[contains(@class, 'partialagendaitems')]")
            for meeting in meetings:
                heading, content = meeting.xpath("./ul/li")
                who, when = heading.text.split(" - ")
                meeting_title = "Scheduled meeting of %s" % who.strip()
                where_lines = content.text_content().split("\r\n")
                where = "\r\n".join([l.strip() for l in where_lines[6:9]])

                when = dt.datetime.strptime(when.strip(),
                                            "%m/%d/%Y %I:%M:%S %p")

                kwargs = {"location": (where or '').strip() or "unknown"}

                event = Event(session, when, 'committee:meeting',
                              meeting_title, **kwargs)

                event.add_participant("host",
                                      who.strip(),
                                      'committee',
                                      chamber=chamber)
                event.add_source(url)

                # Only scraping public hearing bills for now.
                bills = meeting.xpath(
                    ".//div[text() = 'Public Hearing']/following-sibling::li[contains(@class, 'visible-lg')]"
                )
                for bill in bills:
                    bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                    event.add_related_bill(bill_id.strip(),
                                           description=descr.strip(),
                                           type="consideration")

                self.save_event(event)
Example #51
    def scrape(self, chamber, session):
        chmbr = cal_chamber_text[chamber]
        tables = self.url_xpath(cal_weekly_events,
                                "//table[@class='date-table']")
        for table in tables:
            date = table.xpath("../.")[0].getprevious().text_content()
            trs = table.xpath("./tr")
            for tr in trs:
                order = ["time", "chamber", "type", "agenda", "location",
                         "video"]

                tds = tr.xpath("./td")
                metainf = {}

                if not tds:
                    continue

                for el in range(0, len(order)):
                    metainf[order[el]] = tds[el]

                if metainf['chamber'].text_content() == chmbr:
                    self.log("Skipping event based on chamber.")
                    continue

                time = metainf['time'].text_content()
                datetime_string = "%s %s" % (date, time)
                location = metainf['location'].text_content()
                description = metainf['type'].text_content()

                dtfmt = "%A, %B %d, %Y %I:%M %p"
                if time == 'Cancelled':
                    self.log("Skipping cancelled event.")
                    continue
                else:
                    when = dt.datetime.strptime(datetime_string, dtfmt)
                event = Event(session, when, 'committee:meeting',
                              description, location=location)
                event.add_participant("host", description, 'committee', chamber=chamber)
                event.add_source(cal_weekly_events)

                agenda = metainf['agenda'].xpath(".//a")
                if len(agenda) > 0:
                    for doc in agenda:
                        if not doc.text_content():
                            continue
                        agenda_url = doc.attrib['href']
                        self.add_agenda(
                            agenda_url, doc.text_content(), event)
                self.save_event(event)
Example #52
    def scrape_page(self, url, session, chamber):
        page = self.lxmlize(url)

        ctty_name = page.xpath("//span[@class='heading']")[0].text_content().replace(
            "Hearing Notice For ", "")
        tables = page.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

        event = Event(session, datetime, 'committee:meeting',
                      description, location=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            # str.replace returns a new string; assign it back.
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant('host', ctty_name, 'committee', chamber=chamber)

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            event.add_related_bill(bill_id,
                                   description=description,
                                   type='consideration')

        self.save_event(event)
Example #53
    def scrape_event_page(self, url, chamber, session):
        page = self.lxmlize(url)
        trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            metainf[key] = {
                "txt": val.text_content().strip(),
                "obj": val
            }

        if metainf == {}:
            return

        # Wednesday, 5/16/2012 3:00 pm
        datetime = "%s %s" % (
            metainf['Date']['txt'],
            metainf['Time']['txt']
        )
        if "Cancelled" in datetime:
            return

        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
        where = metainf['Location']['txt']
        title = metainf['Committee']['txt']  # XXX: Find a better title

        event = Event(session, datetime, 'committee:meeting',
                      title, location=where)
        event.add_source(url)
        event.add_source(mi_events)

        event.add_participant('host', metainf['Committee']['txt'],
                              chamber=chamber)

        agenda = metainf['Agenda']['obj']
        # Use a relative xpath so we only pick up links inside the agenda cell.
        related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            event.add_related_bill(
                bill.text_content(),
                description=agenda.text_content(),
                type='consideration'
            )

        self.save_event(event)
Example #54
    def scrape_event_page(self, session, chamber, url, datetime):
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainf = {}
        plaintext = ""
        for p in info:
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainf[key.strip()] = val.strip()
        ctty = metainf['COMMITTEE']
        where = metainf['PLACE']
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainf['PLACE'] = where.strip()
            metainf['CHAIR'] = chair.strip()

        chair = None
        if "CHAIR" in metainf:
            chair = metainf['CHAIR']

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      ctty,
                      chamber=chamber,
                      location=where,
                      agenda=plaintext)
        event.add_source(url)
        event.add_participant('host', ctty, 'committee', chamber=chamber)
        if chair is not None:
            event.add_participant('chair',
                                  chair,
                                  'legislator',
                                  chamber=chamber)

        for bill in bills:
            chamber, type, number = bill
            bill_id = "%s%s %s" % (chamber, type, number)
            event.add_related_bill(bill_id,
                                   type='consideration',
                                   description='Bill up for discussion')

        self.save_event(event)
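The PLACE field handled above sometimes embeds the chair's name after a "CHAIR:" marker, which is why the scraper splits and stores both halves. In isolation, with a hypothetical value:

    where = "Room E1.016 CHAIR: Senator Jane Doe"
    chair = None
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        where, chair = where.strip(), chair.strip()
    assert (where, chair) == ("Room E1.016", "Senator Jane Doe")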
Example #55
    def scrape_committee_upcoming(self, session, chamber):
        chamber_name = {
            'upper': 'senate',
            'lower': 'house',
            'other': 'joint'
        }[chamber]
        url = ("http://www.capitol.state.tx.us/MyTLO/RSS/RSS.aspx?"
               "Type=upcomingmeetings%s" % chamber_name)

        with self.urlopen(url) as page:
            feed = feedparser.parse(page)

            for entry in feed['entries']:
                try:
                    title, date = entry['title'].split(' - ')
                except ValueError:
                    continue

                try:
                    time = re.match(r'Time: (\d+:\d+ (A|P)M)',
                                     entry['description']).group(1)
                except AttributeError:
                    # There are a few broken events in their feeds
                    # sometimes
                    continue

                when = "%s %s" % (date, time)
                when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
                when = self._tz.localize(when)

                location = entry['description'].split('Location: ')[1]

                description = 'Committee Meeting\n'
                description += entry['title'] + '\n'
                description += entry['description']

                event = Event(session,
                              when,
                              'committee:meeting',
                              description,
                              location=location)
                event.add_participant('committee', title)

                event['_guid'] = entry['guid']
                event['link'] = entry['link']

                event.add_source(url)

                self.save_event(event)
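The two-way unpack of the feed title above raises ValueError whenever the " - " separator is missing, which is exactly what the try/except uses to drop malformed entries. With hypothetical titles:

    titles = ["Finance Committee - 06/05/2012", "Malformed title"]
    for title in titles:
        try:
            name, date = title.split(" - ")
        except ValueError:
            continue  # skip entries whose title lacks the separator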
Example #56
    def scrape(self, session, chambers):
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath(
                '//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            event = Event(
                    session=session,
                    when=self._TZ.localize(datetime.datetime.strptime(
                            info.xpath('span[@class="col02"]/text()')[0],
                            self._DATETIME_FORMAT
                            )),
                    type='committee:meeting',
                    description=" ".join(x.strip() for x
                            in doc.xpath('//div[@class="schedule"]//text()')
                            if x.strip()),
                    location=doc.xpath(
                            '//div[@class="heading-container"]/span/text()')
                            [0].title()
                    )

            event.add_participant(
                    type='host',
                    participant=info.xpath(
                            'span[@class="col01"]/text()')[0].title(),
                    participant_type='committee'
                    )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                        name=document.xpath('text()')[0],
                        url=document.xpath('@href')[0]
                        )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            self.save_event(event)
Example #57
    def scrape_events(self, chamber, session, event_id):
        url = '%s%s' % (self.upper_url, event_id)
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")
        #some ids are empty
        if len(rows):
            table_data = rows[0].find('table')[1]

            for link in table_data.iterchildren('td'):
                td = link.xpath('//td[@class="ms-formbody"]')

                description = td[18].text
                when = td[19].text
                where = td[25].text
                type = td[27].text
                meeting_lead = td[28].text

                # Use %I (12-hour clock) so the AM/PM marker is honored.
                when = datetime.datetime.strptime(when, "%m/%d/%Y  %I:%M %p")
                when = self._tz.localize(when)
                event_type = 'committee:meeting'
                kwargs = {"location": "State House"}
                if where is not None and where != "":
                    kwargs['location'] = where
                event = Event(session, when, event_type, description, **kwargs)

                if td[20].text is None:
                    # Wrap the single name in a list; iterating a bare
                    # string would walk it character by character.
                    participants = [meeting_lead]
                else:
                    participants = td[20].text.split(';')
                if participants:
                    for participant in participants:
                        name = participant.strip().replace('HON.', '', 1)
                        if name != "":
                            event.add_participant('committee',
                                                  name,
                                                  'committee',
                                                  chamber=chamber)

                event.add_source(url)
                self.save_event(event)
        else:
            # Hack so we don't fail on the early ID numbers, where there are
            # gaps between the IDs that work and those that don't.
            if event_id > 1700:
                raise ScrapeError(
                    "Parsing is done; we are on future ids that are not used yet."
                )
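The participants fix above matters because iterating a bare string walks it character by character; wrapping the single name in a list keeps the loop uniform:

    meeting_lead = "HON. JANE DOE"
    chars = [p for p in meeting_lead]    # iterates characters: 'H', 'O', 'N', ...
    names = [p for p in [meeting_lead]]  # iterates one full name
    assert len(chars) == 13 and names == ["HON. JANE DOE"]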
Example #58
    def scrape(self, chamber, session):
        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        records = self.access_to_csv("Agendas")
        for record in records:
            if record['Status'] != "Scheduled":
                continue
            description = record['Comments']
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id" : "%s %s" % ( bill[0], bill[2] ),
                    "descr": description
                })

            date_time = "%s %s" % (record['Date'], record['Time'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
            hr_name = self._committees[record['CommHouse']]

            event = Event(
                session,
                date_time,
                'committee:meeting',
                "Meeting of the %s" % ( hr_name ),
                location=record['Location'] or "Statehouse",
            )
            for bill in related_bills:
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type='consideration')
            try:
                chamber = {
                    "a" : "lower",
                    "s" : "upper",
                    "j" : "joint"
                }[record['CommHouse'][0].lower()]
            except KeyError:
                chamber = "joint"

            event.add_participant("host",
                                  hr_name,
                                  'committee',
                                  committee_code=record['CommHouse'],
                                  chamber=chamber)
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_event(event)
Example #60
    def scrape(self, session, chambers):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.get(url)
        page = csv.reader(StringIO.StringIO(page.content), delimiter='|')

        for row in page:
            # Deal with embedded newline characters, which cause fake new rows
            LINE_LENGTH = 11
            while len(row) < LINE_LENGTH:
                row += page.next()
                # A parenthesized (condition, message) tuple is always truthy,
                # so the original assert could never fire.
                assert len(row) <= LINE_LENGTH, \
                    "Line is too long: {}".format(row)

            desc = row[7].strip()

            match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
            if match:
                comm_chamber = {'HOUSE': 'lower',
                                'SENATE': 'upper'}[match.group(2)]

                comm = match.group(1).strip()
                comm = re.sub(r'\s+', ' ', comm)
                location = row[5].strip() or 'Unknown'
                when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')

                # Only assign events to a session if they are in the same year
                # Given that session metadata have some overlap and
                # missing end dates, this is the best option available
                session_year = int(session[:4])
                if session_year != when.year:
                    continue

                event = Event(session, when, 'committee:meeting',
                              "%s MEETING" % comm,
                              location=location)
                event.add_source(url)

                event.add_participant('host', comm, 'committee',
                                      chamber=comm_chamber)

                time = row[3].strip()
                if time in TIMECODES:
                    event['notes'] = TIMECODES[time]

                self.save_event(event)
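The assert fixed above is a classic trap: parenthesizing the condition and message creates a two-element tuple, which is always truthy, so the check could never fire. Compare:

    row = list(range(20))
    LINE_LENGTH = 11
    assert (len(row) <= LINE_LENGTH, "too long")  # tuple is truthy: never fires
    # assert len(row) <= LINE_LENGTH, "too long"  # the real check would raise here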