Exemplo n.º 1
0
    def scrape_page(self, url, session, chamber):
        """Scrape a single agenda page and save a committee:meeting event.

        Pulls the committee name, date and time from the page's header
        spans, collects bill ids from the agenda table, and emits one
        Event with the related bills attached.  Bails out silently on
        invalid HTML or when the header spans are missing.
        """
        try:
            page = self.lxmlize(url)
        except lxml.etree.XMLSyntaxError:
            self.warning("Ugh. Invalid HTML")
            return  # Ugh, invalid HTML.
        agendas = page.xpath("//td[@class='numberspace']")

        spans = page.xpath("//center/span")
        ctty = None
        date = None
        time = None
        if len(spans) >= 4:
            ctty = spans[0].text_content().strip()
            date = spans[2].text_content().strip()
            time = spans[3].text_content().strip()

        bills = []
        for agenda in agendas:
            number = agenda.text_content()
            string = agenda.getnext().text_content().strip()
            # e.g. "S.B. 123" / "H.R. 45" -> ("S", "B", "123")
            re_bills = re.findall(r"(S|H)\.?(B|R|M)\. (\d+)", string)
            for bill in re_bills:
                bill_id = "%s%s %s" % bill
                bills.append({"name": bill_id, "desc": string})

        if ctty is None or date is None or time is None:
            return

        datetime = "%s %s" % (date.strip(), time.strip())
        datetime = re.sub("AGENDA", "", datetime).strip()
        datetime = [x.strip() for x in datetime.split("\r\n")]

        if "" in datetime:
            datetime.remove("")

        if len(datetime) == 1:
            # No location line on the page; default it.
            datetime.append("state house")

        where = datetime[1]
        translate = {"a.m.": "AM", "p.m.": "PM"}
        for t in translate:
            datetime[0] = datetime[0].replace(t, translate[t])
        datetime = dt.datetime.strptime(datetime[0], "%A, %B %d, %Y %I:%M %p")

        chamber = "other"
        cLow = ctty.lower()
        # BUGFIX: was "seante" (typo), so senate committees were never
        # classified as "upper" and fell through to "other".
        if "senate" in cLow:
            chamber = "upper"
        elif "house" in cLow:
            chamber = "lower"
        elif "joint" in cLow:
            chamber = "joint"

        event = Event(session, datetime, "committee:meeting", ctty, location=where)
        event.add_source(url)
        event.add_participant("host", ctty, "committee", chamber=chamber)
        for bill in bills:
            event.add_related_bill(bill["name"], description=bill["desc"], type="consideration")
        self.save_event(event)
Exemplo n.º 2
0
    def scrape_event(self, chamber, session, obj):
        """Build and return a committee-meeting Event from a JSON payload.

        Returns None when the meeting's year does not belong to the
        requested session.
        """
        meeting = obj['data']['meeting']
        # The timestamp is in milliseconds since the epoch.
        when = dt.datetime.fromtimestamp(int(meeting['meetingDateTime']) / 1000)
        if str(when.year) not in session:
            return

        description = 'Committee Meeting: ' + meeting['committeeName']
        event = Event(session, when, 'committee:meeting',
                      description=description,
                      location=meeting['location'] or 'No location given.')
        event.add_source(obj['url'])
        event.add_participant('chair', meeting['committeeChair'],
                              'legislator', chamber='upper')
        event.add_participant('host', meeting['committeeName'],
                              'committee', chamber='upper')

        # Split e.g. "S1234" into "S 1234".
        pattern = r'([a-z]+)(\d+)'
        for bill in meeting['bills']:
            match = re.search(pattern, bill['senateBillNo'], re.I)
            bill_id = ' '.join(match.groups())
            event.add_related_bill(
                bill_id,
                type='bill',
                description=bill['summary'] or 'No description given.')
        return event
Exemplo n.º 3
0
    def scrape(self, session, chambers):
        """Scrape Utah committee agendas from the Granicus RSS feed.

        Each RSS item whose date matches the session's year becomes one
        committee:meeting event, with attached PDF documents and any
        bills referenced by the documents' onclick URLs.
        """
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            # Keep only agendas whose year matches the session's leading
            # four-digit year.
            if not when.endswith(session[:len("20XX")]):
                continue

            event = Event(session=session,
                          when=datetime.datetime.strptime(when, '%b %d, %Y'),
                          type='committee:meeting',
                          description=title,
                          location='State Capitol')
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            doc = self.lxmlize(url)
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                # BUGFIX: the name is lower-cased before comparison, so the
                # prefixes must be lower-case too; "Senate"/"House" could
                # never match and everything fell through to "joint".
                if committee_name.lower().startswith("senate"):
                    chamber = "upper"
                elif committee_name.lower().startswith("house"):
                    chamber = "lower"
                else:
                    chamber = "joint"
                event.add_participant(type='host',
                                      participant=committee_name,
                                      participant_type='committee',
                                      chamber=chamber)

            documents = doc.xpath('.//td')
            for document in documents:
                onclick = document.xpath('@onclick')
                if not onclick:
                    # BUGFIX: cells without an onclick attribute previously
                    # raised IndexError on [0].
                    continue
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(name=document.xpath('text()')[0],
                                   url=url,
                                   mimetype='application/pdf')
                for bill in onclick:
                    if "bills/static" in bill:
                        # e.g. ".../bills/static/HB0001.html" -> "HB0001"
                        bill_name = bill.split("/")[-1].split(".")[0]
                        event.add_related_bill(
                            bill_name,
                            type='consideration',
                            description='Bill up for discussion')

            self.save_event(event)
Exemplo n.º 4
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly schedule page and save one
        committee:meeting event per agenda row.

        Rows whose time cell contains "@" or is empty are skipped;
        the agenda PDF link doubles as the event's guid.
        """
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each "See Agenda in pdf" icon's parent <a> anchors one row.
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub("\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            # Split the date/time from the trailing room designator
            # (rooms look like "H..." or "C...-..." — e.g. "C-3").
            info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                            when_and_where).groupdict()

            when_and_where = info['when']
            location = info['where']

            # The page omits the year, so only current-year events can
            # be scraped.
            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 5
0
    def scrape_agenda(self, url, session):
        """Scrape one agenda page: the date/time/place table, the bills
        listed, and the hosting committee, then save the event.

        NOTE: the local name `datetime` deliberately shadows the module;
        the actual module is used via the `dt` alias.
        """
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
        fmt = "%A, %B %d, %Y"
        # `all_day` is presumably a module-level collection of TIME values
        # meaning "no specific time" — defined elsewhere in this file.
        if time in all_day:
            datetime = date
        else:
            fmt += " %I:%M %p"
            datetime = "%s %s" % ( date, time )
        datetime = dt.datetime.strptime(datetime, fmt)

        event = Event(session, datetime, 'committee:meeting',
                      'Meeting Notice', location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(), bill_ft, type="full-text",
                               mimetype="application/pdf")
            # The bill id is the concatenated text of the link's grandparent's
            # children.
            root = bill.xpath('../../*')
            root = [ x.text_content() for x in root ]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().getparent().getnext().getnext(
                ).text_content()

            # `replace` is presumably a module-level {old: new} mapping used
            # to normalize bill ids — defined elsewhere in this file.
            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {
            "house" : "lower",
            "joint" : "joint",
            "senate" : "upper"
        }
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, chamber=chamber)

        self.save_event(event)
Exemplo n.º 6
0
    def scrape(self, session, chambers):
        """Scrape the hearing-notice grid and save one event per row."""
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        # First row is the header; skip it.
        for row in table.xpath(".//tr")[1:]:
            tds = row.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0]
            when = dt.datetime.strptime(tds[2].text_content().strip(),
                                        "%m/%d/%Y %I:%M %p")
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text

            event = Event(session, when, 'committee:meeting', descr,
                          location=where)

            # Joint hearings list several committees separated by "/".
            committees = committee.split("/") if "/" in committee else [committee]

            for ctty in committees:
                if "INFO" in ctty:
                    info = {"chamber": "joint", "name": ctty}
                else:
                    # Resolve the short code to {"chamber": ..., "name": ...}.
                    info = self.short_ids[ctty]
                event.add_participant('host', info['name'], 'committee',
                                      chamber=info['chamber'])

            event.add_source(URL)
            event.add_document(notice_name, notice_href, mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type=bill['type'])

            self.save_event(event)
Exemplo n.º 7
0
    def scrape(self, session, chambers):
        """Scrape the DC Council calendar and save one event per entry.

        Entries whose description names a hosting committee become
        committee:meeting events; everything else is typed "other".
        """
        calendar_url = "http://dccouncil.us/calendar"
        data = self.get(calendar_url).text
        doc = lxml.html.fromstring(data)

        # Pulls "Committee ... " out of free text like
        # "The Committee on X will hold ...".
        committee_regex = re.compile("(Committee .*?)will")

        event_list = doc.xpath("//div[@class='event-description-dev']")
        for event in event_list:
            place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
            when = " ".join([place_and_time[0].strip(),place_and_time[1].strip()])
            if len(place_and_time) > 2:
                location = place_and_time[2]
            else:
                location = "unknown"
            #when is now of the following format:
            #Wednesday, 2/25/2015 9:30am
            when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
            description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
            description_lines = description_content.xpath("./*")
            # Drop the first child (the title) before searching for the
            # committee name.
            desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
            description = re.sub(r'\s+'," ", description_content.text_content()).strip()
            potential_bills = description_content.xpath(".//li")

            committee = committee_regex.search(desc_without_title)
            event_type = 'other'
            if committee is not None:
                committee = committee.group(1).strip()
                event_type = 'committee:meeting'

            e = Event(session,when,event_type,description,location)

            for b in potential_bills:
                bill = b.xpath("./a/text()")
                if len(bill) == 0:
                    #no bills
                    continue
                bill = bill[0]
                bill_desc = b.text_content().replace(bill,"").strip(", ").strip()
                # Normalize e.g. "B 21-123" -> "B21-0123" (zero-pad number).
                ses,num = bill.split("-")
                bill = ses.replace(" ","")+"-"+num.zfill(4)
                if "PR" in bill or "CER" in bill:
                    e.add_related_bill(bill,type="resolution",description=bill_desc)
                else:
                    e.add_related_bill(bill,type="bill",description=bill_desc)

            e.add_source(calendar_url)

            if committee:
                e.add_participant("host",
                                  committee,
                                  'committee',
                                  chamber="upper")

            self.save_event(e)
Exemplo n.º 8
0
    def scrape(self, session, chambers):
        """Scrape the hearing-notice grid and save one event per row.

        Committees are resolved from their short codes via
        self.short_ids; "INFO" committees are treated as joint.
        """
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        # First row is the header; skip it.
        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session, when, 'committee:meeting', descr,
                          location=where)

            # Joint hearings list several committees separated by "/".
            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    # BUGFIX: previously looked up the literal string
                    # "committee" instead of the committee code, so every
                    # committee fell through to the "unknown" default.
                    committee = self.short_ids.get(
                        committee, {"chamber": "unknown", "name": committee})
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host', committee['name'], 'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(
                    bill['bill_id'],
                    description=bill['descr'],
                    type=bill['type']
                )

            self.save_event(event)
Exemplo n.º 9
0
    def scrape(self, chamber, session):
        """Scrape CA committee hearings from the database session and
        save one event per (location, date) group.

        NOTE(review): uses dict.iteritems(), so this targets Python 2.
        """
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            # The first three characters of the location encode the chamber
            # ("Asm" / "Sen"); anything else raises KeyError here.
            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.iteritems():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            # e.g. "201320140AB123" -> "AB 123"
            bills = [
                "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = 'Committee Meeting: ' + committee_name
            event = Event(session,
                          date,
                          'committee:meeting',
                          desc,
                          location=committee_name)
            for bill_id in bills:
                if 'B' in bill_id:
                    type_ = 'bill'
                else:
                    type_ = 'resolution'
                event.add_related_bill(bill_id,
                                       type=type_,
                                       description='consideration')

            event.add_participant('host',
                                  committee_name + ' Committee',
                                  'committee',
                                  chamber=chamber)
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            self.save_event(event)
Exemplo n.º 10
0
def test_event():
    """Events accept documents and related bills and expose them as lists."""
    when = datetime.datetime(2012, 1, 1)
    e = Event('S1', when, 'meeting', 'event description', 'event location')
    e.add_document('agenda', 'http://example.com/event/agenda.txt')
    e.add_related_bill('HB 1', relation='considered')

    expected_docs = [{'name': 'agenda',
                      'url': 'http://example.com/event/agenda.txt',
                      'type': 'other'}]
    expected_bills = [{'bill_id': 'HB 1', 'relation': 'considered'}]
    assert_equal(e['documents'], expected_docs)
    assert_equal(e['related_bills'], expected_bills)
Exemplo n.º 11
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly schedule and save one
        committee:meeting event per agenda row.

        Variant that also accepts rooms prefixed F or N in addition to
        H and C.
        """
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each "See Agenda in pdf" icon's parent <a> anchors one row.
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub("\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            # Split the date/time from the trailing room designator
            # (rooms start with F, N, H, or C, e.g. "C-3").
            info = re.match(
                r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)",
                when_and_where
            ).groupdict()

            when_and_where = info['when']
            location = info['where']

            # The page omits the year, so only current-year events can
            # be scraped.
            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, 'committee',
                                  chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill, description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 12
0
    def scrape_page(self, url, session, chamber):
        """Scrape an IL-style hearing-notice page and save the event.

        Reads the key/value metadata table for location, subject and
        scheduled date, then attaches every bill row from the second
        table as a related bill.
        """
        page = self.lxmlize(url)

        ctty_name = page.xpath(
            "//span[@class='heading']")[0].text_content().replace(
                "Hearing Notice For ", "")
        tables = page.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      description,
                      location=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            # BUGFIX: str.replace returns a new string; the result was
            # previously discarded, leaving the prefix in place.
            ctty_name = ctty_name.replace('Hearing Notice For', '')
        event.add_participant('host', ctty_name, 'committee', chamber=chamber)

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            event.add_related_bill(bill_id,
                                   description=description,
                                   type='consideration')

        self.save_event(event)
Exemplo n.º 13
0
    def scrape(self, chamber, session):
        """Scrape WA committee meetings for the next 30 days for one
        chamber and save each as a committee:meeting event."""
        chamber_id = {"upper": "7", "lower": "3", "other": "4"}[chamber]

        fmt = "%m/%d/%Y"
        today = dt.datetime.now()
        start = today.strftime(fmt)
        end = (today + timedelta(days=30)).strftime(fmt)
        url = event_page % (chamber_id, start, end)

        page = self.lxmlize(url)

        comm_urls = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
        for comm_url in comm_urls:
            comm_page = self.lxmlize(comm_url)
            agenda_items = comm_page.xpath(
                "//li[contains(@class, 'partialagendaitems')]")
            for meeting in agenda_items:
                heading, content = meeting.xpath("./ul/li")
                who, when = heading.text.split(" - ")
                title = "Scheduled meeting of %s" % who.strip()
                # Lines 6-8 of the content hold the room/address.
                lines = content.text_content().split("\r\n")
                where = "\r\n".join(l.strip() for l in lines[6:9])

                when = dt.datetime.strptime(when.strip(),
                                            "%m/%d/%Y %I:%M:%S %p")

                location = (where or '').strip() or "unknown"
                event = Event(session, when, 'committee:meeting',
                              title, location=location)
                event.add_participant("host", who.strip(), 'committee',
                                      chamber=chamber)
                event.add_source(url)

                # Only public-hearing bills are scraped for now.
                hearing_bills = meeting.xpath(
                    ".//div[text() = 'Public Hearing']"
                    "/following-sibling::li[contains(@class, 'visible-lg')]")
                for bill in hearing_bills:
                    bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                    event.add_related_bill(bill_id.strip(),
                                           description=descr.strip(),
                                           type="consideration")

                self.save_event(event)
Exemplo n.º 14
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly schedule and save one
        committee:meeting event per agenda row.

        Variant that reads cells one table level higher (../../../td)
        and takes the location from the last comma-separated chunk of
        the when/where cell.
        """
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each "See Agenda in pdf" icon's parent <a> anchors one row.
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../../td[1])").strip()

            when_and_where = link.xpath("string(../../../td[2])").strip()

            # The room/location is the last comma-separated chunk.
            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 15
0
    def scrape_meeting_notice(self, chamber, session, url):
        """Scrape a single meeting-notice page and save the event.

        The notice is a key/value table (Date/Time, Chamber, Room,
        Chairman, Committee); agenda lines beginning with "H"/"S"
        followed by a number become related bills.
        """
        page = self.lxmlize(url)
        metainf = {}
        # Build the key/value map, stripping a trailing ":" from keys.
        for row in page.xpath("//td[@width='96%']/table/tr"):
            cells = row.xpath(".//td")
            key = cells[0].text_content().strip()
            if key.endswith(":"):
                key = key[:-1]
            metainf[key] = cells[1].text_content().strip()

        date_time_lbl = "Date/Time"
        # e.g. "04/25/2012 03:00:00 PM"
        metainf[date_time_lbl] = dt.datetime.strptime(
            metainf[date_time_lbl], "%m/%d/%Y %I:%M:%S %p")
        event = Event(session,
                      metainf[date_time_lbl],
                      "committee:meeting",
                      "Committee Meeting",
                      chamber=chambers[metainf['Chamber']],
                      location=metainf['Room'],
                      chairman=metainf['Chairman'])
        event.add_participant("host",
                              metainf['Committee'],
                              'committee',
                              chamber=chambers[metainf['Chamber']])
        event.add_source(url)

        agenda = [a.text_content().strip()
                  for a in page.xpath("//td[@width='96%']//font[@face='Arial']")]
        if "" in agenda:
            agenda.remove("")
        for item in agenda:
            words = item.split()[:2]
            # Only lines starting with H or S can reference bills.
            if words[0][0] not in ("H", "S"):
                continue
            try:
                int(words[1])
            except (ValueError, IndexError):
                # Second token missing or not a bill number.
                continue
            event.add_related_bill("%s %s" % (words[0], words[1]),
                                   description=item,
                                   type="consideration")

        self.save_event(event)
Exemplo n.º 16
0
    def scrape(self, chamber, session):
        """Scrape NJ committee agendas from the AGENDAS DBF export and
        save every Scheduled meeting as a committee:meeting event.
        """
        # Sessions map to two-year terms: (session - 209) * 2 + 2000,
        # e.g. session 215 -> year 2012.
        year_abr = ((int(session) - 209) * 2) + 2000
        self.initialize_committees(year_abr)
        url, db = self.get_dbf(year_abr, "AGENDAS")
        records = [ x.asDict() for x in db ]
        for record in records:
            if record['STATUS'] != "Scheduled":
                continue
            description = record['COMMENTS']
            related_bills = []

            # Bill references in the comments look like "A1234" or "S-1234".
            for bill in re.findall("(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id" : "%s %s" % ( bill[0], bill[2] ),
                    "descr": description
                })

            date_time = "%s %s" % (
                record['DATE'],
                record['TIME']
            )
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
            hr_name = self._committees[record['COMMHOUSE']]

            event = Event(
                session,
                date_time,
                'committee:meeting',
                "Meeting of the %s" % ( hr_name ),
                location=record['LOCATION'] or "Statehouse",
            )
            for bill in related_bills:
                event.add_related_bill(bill['bill_id'],
                                      description=bill['descr'],
                                      type='consideration')
            # The first letter of the committee code encodes its chamber.
            try:
                chamber = {
                    "a" : "lower",
                    "s" : "upper",
                    "j" : "joint"
                }[record['COMMHOUSE'][0].lower()]
            except KeyError:
                chamber = "joint"

            event.add_participant("host",
                                  hr_name,
                                  'committee',
                                  committee_code=record['COMMHOUSE'],
                                  chamber=chamber)
            # NOTE(review): agenda_dbf is presumably a module-level URL/path
            # for the DBF source — confirm it is defined elsewhere in file.
            event.add_source(agenda_dbf)
            self.save_event(event)
Exemplo n.º 17
0
    def scrape_meeting_notice(self, chamber, session, url):
        """Scrape a single meeting-notice page into an Event.

        Parses the page's key/value metadata table for date/time,
        chamber, room, chair and committee, then attaches any H*/S*
        bill numbers found in the agenda text as related bills.
        """
        page = self.lxmlize(url)
        bits = page.xpath("//td[@width='96%']/table/tr")
        metainf = {}
        for bit in bits:
            info = bit.xpath(".//td")
            key = info[0].text_content().strip()
            val = info[1].text_content().strip()
            # Normalize "Key:" -> "Key".
            if key.endswith(":"):
                key = key[:-1]
            metainf[key] = val

        date_time_lbl = "Date/Time"
        # e.g. 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"
        metainf[date_time_lbl] = dt.datetime.strptime(metainf[date_time_lbl],
                                                      fmt)
        event = Event(session,
                      metainf[date_time_lbl],
                      "committee:meeting",
                      "Committee Meeting",
                      chamber=chambers[metainf['Chamber']],
                      location=metainf['Room'],
                      chairman=metainf['Chairman'])
        event.add_participant("host", metainf['Committee'],
                              chamber=chambers[metainf['Chamber']])
        event.add_source(url)

        agenda = page.xpath("//td[@width='96%']//font[@face='Arial']")
        agenda = [a.text_content().strip() for a in agenda]
        if "" in agenda:
            agenda.remove("")
        for item in agenda:
            # Bill references look like "HB 123" / "SR 45": the first
            # token starts with H or S and the second must be numeric.
            tokens = item.split()[:2]
            if tokens[0][0] in ("H", "S"):
                try:
                    # Validate the number; the value itself is unused.
                    int(tokens[1])
                except (ValueError, IndexError):
                    continue
                bill_id = "%s %s" % (tokens[0], tokens[1])
                event.add_related_bill(
                    bill_id,
                    description=item,
                    type="consideration"
                )

        self.save_event(event)
Exemplo n.º 18
0
    def scrape(self, chamber, session):
        """Scrape NJ committee-meeting events from the AGENDAS DBF table.

        Only records whose STATUS is "Scheduled" are emitted; bills
        referenced in the record's comments are attached as related
        bills on the Event.
        """
        # NJ session 209 began in 2000; each session covers two years.
        year_abr = ((int(session) - 209) * 2) + 2000
        self.initialize_committees(year_abr)
        url, db = self.get_dbf(year_abr, "AGENDAS")
        records = [x.asDict() for x in db]
        for record in records:
            if record['STATUS'] != "Scheduled":
                continue
            description = record['COMMENTS']
            related_bills = []

            # Matches bill mentions such as "A1234" or "S-1234".
            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id": "%s %s" % (bill[0], bill[2]),
                    "descr": description
                })

            date_time = "%s %s" % (record['DATE'], record['TIME'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
            hr_name = self._committees[record['COMMHOUSE']]

            event = Event(
                session,
                date_time,
                'committee:meeting',
                "Meeting of the %s" % (hr_name),
                location=record['LOCATION'] or "Statehouse",
            )
            for bill in related_bills:
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type='consideration')
            # Committee codes start with "A"/"S"/"J"; map to the chamber,
            # defaulting to joint for anything unexpected.
            try:
                chamber = {
                    "a": "lower",
                    "s": "upper",
                    "j": "joint"
                }[record['COMMHOUSE'][0].lower()]
            except KeyError:
                chamber = "joint"

            event.add_participant("host",
                                  hr_name,
                                  'committee',
                                  committee_code=record['COMMHOUSE'],
                                  chamber=chamber)
            # BUG FIX: `agenda_dbf` was undefined here; the source is the
            # URL that get_dbf() returned.
            event.add_source(url)
            self.save_event(event)
Exemplo n.º 19
0
def test_event():
    """Event should expose added documents and related bills as dicts."""
    event = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
                  'event description', 'event location')
    event.add_document('agenda', 'http://example.com/event/agenda.txt')
    event.add_related_bill('HB 1', relation='considered')

    expected_documents = [{
        'name': 'agenda',
        'url': 'http://example.com/event/agenda.txt',
        'type': 'other'
    }]
    expected_bills = [{
        'bill_id': 'HB 1',
        'relation': 'considered'
    }]
    assert_equal(event['documents'], expected_documents)
    assert_equal(event['related_bills'], expected_bills)
Exemplo n.º 20
0
    def scrape(self, chamber, session):
        """Scrape upcoming committee meetings for one chamber.

        Pulls the chamber's agenda listing for the next 30 days, walks
        each committee's agenda page, and saves an Event per scheduled
        meeting with its public-hearing bills attached.
        """
        # Map openstates chamber names to the site's numeric codes.
        cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

        print_format = "%m/%d/%Y"
        now = dt.datetime.now()

        # Query window: today through 30 days out.
        start = now.strftime(print_format)
        end = (now + timedelta(days=30)).strftime(print_format)
        url = event_page % (cha, start, end)

        page = self.lxmlize(url)

        committees = page.xpath(
            "//a[contains(@href,'Agendas?CommitteeId')]/@href")
        for comm in committees:
            comm_page = self.lxmlize(comm)
            meetings = comm_page.xpath(
                "//li[contains(@class, 'partialagendaitems')]")
            for meeting in meetings:
                # Heading reads "<committee> - <datetime>"; content holds
                # the location and agenda details.
                heading, content = meeting.xpath("./ul/li")
                who, when = heading.text.split(" - ")
                meeting_title = "Scheduled meeting of %s" % who.strip()
                # Location sits on fixed lines (6-8) of the content block —
                # assumes the site's current layout; TODO confirm if it changes.
                where_lines = content.text_content().split("\r\n")
                where = "\r\n".join([l.strip() for l in where_lines[6:9]])

                when = dt.datetime.strptime(when.strip(),
                                            "%m/%d/%Y %I:%M:%S %p")

                kwargs = {"location": (where or '').strip() or "unknown"}

                event = Event(session, when, 'committee:meeting',
                              meeting_title, **kwargs)

                event.add_participant("host",
                                      who.strip(),
                                      'committee',
                                      chamber=chamber)
                event.add_source(url)

                #only scraping public hearing bills for now.
                bills = meeting.xpath(
                    ".//div[text() = 'Public Hearing']/following-sibling::li[contains(@class, 'visible-lg')]"
                )
                for bill in bills:
                    # Each entry reads "<bill id> - <description>".
                    bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                    event.add_related_bill(bill_id.strip(),
                                           description=descr.strip(),
                                           type="consideration")

                self.save_event(event)
Exemplo n.º 21
0
    def scrape_event_page(self, url, chamber, session):
        """Scrape one MI committee-meeting page into an Event.

        The page is a two-column key/value table; the Date, Time,
        Location, Committee and Agenda rows drive the Event.
        """
        page = self.lxmlize(url)
        trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            # Keep both the text and the element; the Agenda cell is
            # re-queried for bill links below.
            metainf[key] = {
                "txt": val.text_content().strip(),
                "obj": val
            }

        if metainf == {}:
            return

        # Wednesday, 5/16/2012 3:00 pm
        datetime = "%s %s" % (
            metainf['Date']['txt'],
            metainf['Time']['txt']
        )
        if "Cancelled" in datetime:
            return

        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
        where = metainf['Location']['txt']
        title = metainf['Committee']['txt']  # XXX: Find a better title


        event = Event(session, datetime, 'committee:meeting',
                      title, location=where)
        event.add_source(url)
        event.add_source(mi_events)

        event.add_participant('host', metainf['Committee']['txt'],
                              chamber=chamber)

        agenda = metainf['Agenda']['obj']
        # NOTE(review): "//a" is an absolute xpath — it searches the whole
        # document, not just the agenda cell; confirm that is intended.
        related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            event.add_related_bill(
                bill.text_content(),
                description=agenda.text_content(),
                type='consideration'
            )

        self.save_event(event)
Exemplo n.º 22
0
    def scrape_event_page(self, session, chamber, url, datetime):
        """Scrape one TX committee-meeting notice page.

        `datetime` is the already-parsed meeting time; the page supplies
        COMMITTEE, PLACE and (optionally) CHAIR fields, plus agenda text
        from which bill numbers are extracted.
        """
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainf = {}
        plaintext = ""
        for p in info:
            # Collapse whitespace so "KEY: value" lines parse cleanly.
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainf[key.strip()] = val.strip()
        ctty = metainf['COMMITTEE']
        where = metainf['PLACE']
        # The PLACE line sometimes embeds the chair: "Room 100 CHAIR: Smith".
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainf['PLACE'] = where.strip()
            metainf['CHAIR'] = chair.strip()

        chair = None
        if "CHAIR" in metainf:
            chair = metainf['CHAIR']

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      ctty,
                      chamber=chamber,
                      location=where,
                      agenda=plaintext)
        event.add_source(url)
        event.add_participant('host', ctty, 'committee', chamber=chamber)
        # PEP 8 idiom: "is not None" rather than "not ... is None".
        if chair is not None:
            event.add_participant('chair',
                                  chair,
                                  'legislator',
                                  chamber=chamber)

        for bill in bills:
            # Renamed to avoid rebinding the `chamber` argument and
            # shadowing the builtin `type`.
            bill_chamber, bill_type, number = bill
            bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
            event.add_related_bill(bill_id,
                                   type='consideration',
                                   description='Bill up for discussion')

        self.save_event(event)
Exemplo n.º 23
0
    def scrape_page(self, url, session, chamber):
        """Scrape an IL hearing-notice page and save it as an Event."""
        page = self.lxmlize(url)

        # Heading reads "Hearing Notice For <committee>".
        ctty_name = page.xpath("//span[@class='heading']")[0].text_content().replace(
            "Hearing Notice For ", "")
        tables = page.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        # The site runs the meridian into the minutes ("1:00PM"); put the
        # space back so strptime's %p directive can match.
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

        event = Event(session, datetime, 'committee:meeting',
                      description, location=where)
        event.add_source(url)

        # BUG FIX: str.replace returns a new string; the original call
        # discarded the result, so the prefix was never actually removed.
        if ctty_name.startswith('Hearing Notice For'):
            ctty_name = ctty_name.replace('Hearing Notice For', '')
        event.add_participant('host', ctty_name, 'committee', chamber=chamber)

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            event.add_related_bill(bill_id,
                                   description=description,
                                   type='consideration')

        self.save_event(event)
Exemplo n.º 24
0
    def scrape(self, chamber, session):
        """Scrape scheduled NJ committee meetings from the Agendas table."""
        # Session 209 began in 2000 and each session runs two years, so
        # derive the session's starting calendar year.
        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        records = self.access_to_csv("Agendas")
        for record in records:
            # Only meetings still on the calendar are scraped.
            if record['Status'] != "Scheduled":
                continue
            description = record['Comments']
            related_bills = []

            # Bill references in the comments look like "A1234" / "S-1234".
            for bill in re.findall("(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id" : "%s %s" % ( bill[0], bill[2] ),
                    "descr": description
                })

            date_time = "%s %s" % (record['Date'], record['Time'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
            hr_name = self._committees[record['CommHouse']]

            event = Event(
                session,
                date_time,
                'committee:meeting',
                "Meeting of the %s" % ( hr_name ),
                location=record['Location'] or "Statehouse",
            )
            for bill in related_bills:
                event.add_related_bill(bill['bill_id'],
                                      description=bill['descr'],
                                      type='consideration')
            # First letter of the committee code ("A"/"S"/"J") picks the
            # chamber; anything unrecognized defaults to joint.
            try:
                chamber = {
                    "a" : "lower",
                    "s" : "upper",
                    "j" : "joint"
                }[record['CommHouse'][0].lower()]
            except KeyError:
                chamber = "joint"

            event.add_participant("host",
                                  hr_name,
                                  'committee',
                                  committee_code=record['CommHouse'],
                                  chamber=chamber)
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_event(event)
Exemplo n.º 25
0
    def scrape(self, chamber, session):
        """Scrape scheduled NJ committee meetings from the Agendas table."""
        # Session 209 started in 2000; each session spans two years.
        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)

        for record in self.access_to_csv("Agendas"):
            if record['Status'] != "Scheduled":
                continue

            description = record['Comments']
            # Collect every "A1234" / "S-1234" style reference in the notes.
            related_bills = [
                {
                    "bill_id": "%s %s" % (match[0], match[2]),
                    "descr": description,
                }
                for match in re.findall("(A|S)(-)?(\d{4})", description)
            ]

            when = dt.datetime.strptime(
                "%s %s" % (record['Date'], record['Time']),
                "%m/%d/%Y %I:%M %p")
            committee_name = self._committees[record['CommHouse']]

            event = Event(
                session,
                when,
                'committee:meeting',
                "Meeting of the %s" % (committee_name),
                location=record['Location'] or "Statehouse",
            )
            for related in related_bills:
                event.add_related_bill(related['bill_id'],
                                       description=related['descr'],
                                       type='consideration')

            # First letter of the committee code picks the chamber;
            # anything unexpected falls back to joint.
            chamber = {
                "a": "lower",
                "s": "upper",
                "j": "joint",
            }.get(record['CommHouse'][0].lower(), "joint")

            event.add_participant("host",
                                  committee_name,
                                  'committee',
                                  committee_code=record['CommHouse'],
                                  chamber=chamber)
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_event(event)
Exemplo n.º 26
0
    def scrape(self, chamber, session):
        """Scrape CA committee hearings for `chamber` from the DB mirror.

        Hearings are grouped by (location, date) so several bills heard
        in the same meeting become a single Event.
        """
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            # Location descriptions start with "Asm"/"Sen"; that prefix
            # decides which chamber the hearing belongs to.
            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        # NOTE(review): dict.iteritems() is Python 2 only.
        for ((location, date), hearings) in grouped_hearings.iteritems():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            # Raw ids are "<digits><type letters><number>"; re-join the
            # type and number as "<type> <number>".
            bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                     for bill in bill_ids]

            # Dereference the committee_nr number and get display name.
            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date),)
            assert len(set(hearing.committee_nr for hearing in hearings)
                       ) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = 'Committee Meeting: ' + committee_name
            event = Event(session, date, 'committee:meeting', desc,
                          location=committee_name)
            for bill_id in bills:
                # A "B" in the id marks a bill; everything else is
                # treated as a resolution.
                if 'B' in bill_id:
                    type_ = 'bill'
                else:
                    type_ = 'resolution'
                event.add_related_bill(bill_id, type=type_,
                                       description='consideration')

            event.add_participant('host', committee_name + ' Committee',
                                  'committee', chamber=chamber)
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            self.save_event(event)
Exemplo n.º 27
0
    def scrape_event_page(self, session, chamber, url, datetime):
        """Scrape a TX meeting-notice page into a committee:meeting Event.

        The notice is a series of "KEY: value" paragraphs (COMMITTEE,
        PLACE, optionally CHAIR); bill ids like "HB 123" are pulled out
        of the flattened agenda text.
        """
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainf = {}
        plaintext = ""
        for p in info:
            # Collapse whitespace so the "KEY: value" split is reliable.
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainf[key.strip()] = val.strip()
        ctty = metainf['COMMITTEE']
        where = metainf['PLACE']
        # Some notices embed the chair in PLACE: "Room 100 CHAIR: Smith".
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainf['PLACE'] = where.strip()
            metainf['CHAIR'] = chair.strip()

        chair = None
        if "CHAIR" in metainf:
            chair = metainf['CHAIR']

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      ctty,
                      chamber=chamber,
                      location=where,
                      agenda=plaintext)
        event.add_source(url)
        event.add_participant('host', ctty, 'committee', chamber=chamber)
        # PEP 8 idiom: "is not None" rather than "not ... is None".
        if chair is not None:
            event.add_participant(
                'chair', chair, 'legislator', chamber=chamber)

        for bill in bills:
            # Renamed so the loop no longer rebinds the `chamber`
            # argument or shadows the builtin `type`.
            bill_chamber, bill_type, number = bill
            bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
            event.add_related_bill(bill_id,
                                   type='consideration',
                                   description='Bill up for discussion')

        self.save_event(event)
Exemplo n.º 28
0
    def scrape_house_weekly_schedule(self, session):
        """Scrape the LA House weekly committee schedule page."""
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each "See Agenda in pdf" icon anchors one meeting row.
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()

            # Location is the last comma-separated chunk of the cell.
            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, 'committee',
                                  chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill, description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Exemplo n.º 29
0
    def scrape_page(self, url, session, chamber):
        """Scrape a hearing-notice page into a committee:meeting Event."""
        page = self.lxmlize(url)

        ctty_name = page.xpath("//span[@class='heading']")[0].text_content()

        tables = page.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf["Location:"]
        description = ctty_name

        datetime = metainf["Scheduled Date:"]
        datetime = re.sub("\s+", " ", datetime)
        # Re-insert the space before AM/PM so strptime's %p can match.
        repl = {"AM": " AM", "PM": " PM"}  # Space shim.
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

        event = Event(session, datetime, "committee:meeting", description, location=where)
        event.add_source(url)

        event.add_participant("host", ctty_name, chamber=chamber)

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            # NOTE(review): `descr` is computed but never used — the call
            # below passes the committee-level `description` instead.
            # Confirm whether the per-bill text was intended here.
            descr = tds[2].text_content()
            event.add_related_bill(bill_id, description=description, type="consideration")

        self.save_event(event)
Exemplo n.º 30
0
    def upper_scrape_event(self, chamber, session, obj):
        """Build an Event from an upper-chamber meeting payload.

        Returns the Event, or None when the meeting's year does not
        belong to the requested session.
        """
        meeting = obj['data']['meeting']
        # Meeting timestamps arrive in milliseconds since the epoch.
        ts_ms = int(meeting['meetingDateTime'])
        date = dt.datetime.fromtimestamp(ts_ms / 1000)
        if str(date.year) not in session:
            return

        committee = meeting['committeeName']
        event = Event(session, date, 'committee:meeting',
                      description='Committee Meeting: ' + committee,
                      location=meeting['location'] or 'No location given.')
        event.add_source(obj['url'])
        event.add_participant('chair', meeting['committeeChair'],
                              'legislator', chamber='upper')
        event.add_participant('host', committee,
                              'committee', chamber='upper')

        # Raw ids look like "S1234"; split the letter prefix from the number.
        bill_rgx = r'([a-z]+)(\d+)'
        for bill in meeting['bills']:
            parts = re.search(bill_rgx, bill['senateBillNo'], re.I).groups()
            event.add_related_bill(
                ' '.join(parts), type='bill',
                description=bill['summary'] or 'No description given.')
        return event
Exemplo n.º 31
0
    def scrape_agenda(self, url, session):
        """Scrape one RI agenda page into a committee:meeting Event."""
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
        # Formats tried in order; some agendas omit the time entirely.
        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        # `all_day` is presumably a module-level list of time strings that
        # mean "no specific time" — confirm against the module header.
        if time in all_day:
            datetime = date
        else:
            datetime = "%s %s" % ( date, time )
        if "CANCELLED" in datetime or "Rise of the House" in datetime:
            # XXX: Do something more advanced.
            return

        # Normalize the many AM/PM spellings and strip scheduling noise
        # so one of the strptime formats above can match.
        transtable = {
            "P.M" : "PM",
            "PM." : "PM",
            "P.M." : "PM",
            "A.M." : "AM",
            "POSTPONED" : "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue
        # NOTE(review): if no format matched, `datetime` is still a plain
        # string here and is passed to Event as-is — confirm intended.

        event = Event(session, datetime, 'committee:meeting',
                      'Meeting Notice', location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(), bill_ft, type="full-text",
                               mimetype="application/pdf")
            # The bill id is the concatenated text of the link's row cells.
            root = bill.xpath('../../*')
            root = [ x.text_content() for x in root ]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().getparent().getnext().getnext(
                ).text_content()

            # `replace` is presumably a module-level translation table for
            # bill-id cleanup — confirm against the module header.
            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {
            "house" : "lower",
            "joint" : "joint",
            "senate" : "upper"
        }
        # Infer the hosting chamber from the committee label.
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, 'committee', chamber=chamber)

        self.save_event(event)
Exemplo n.º 32
0
    def scrape(self, chamber, session):
        """Scrape WA committee agendas for `chamber` over the next 4 weeks."""

        cha = {"upper": "senate", "lower": "house", "other": "joint"}[chamber]

        print_format = "%m/%d/%Y"

        # Query window: today through four weeks out.
        now = dt.datetime.now()
        start = now.strftime(print_format)
        then = now + timedelta(weeks=4)
        end = then.strftime(print_format)
        url = event_page % (cha, start, end)

        page = self.lxmlize(url)

        def _split_tr(trs):
            # Split the flat list of <tr>s into per-event chunks; a row
            # containing an <hr> marks the boundary between events.
            ret = []
            cur = []
            for tr in trs:
                if len(tr.xpath(".//hr")) > 0:
                    ret.append(cur)
                    cur = []
                    continue
                cur.append(tr)
            if cur != []:
                ret.append(cur)
            return ret

        tables = page.xpath("//table[@class='AgendaCommittee']")
        for table in tables:
            # grab agenda, etc
            trs = table.xpath(".//tr")
            events = _split_tr(trs)
            for event in events:
                # Each chunk is exactly a header row plus a body row.
                assert len(event) == 2
                header = event[0]
                body = event[1]
                # Header reads "<committee> - <mm/dd/yy  hh:mm am/pm>".
                whowhen = header.xpath(".//h2")[0].text_content()
                blocks = [x.strip() for x in whowhen.rsplit("-", 1)]
                who = blocks[0]
                when = blocks[1].replace(u'\xa0', ' ')
                if "TBA" in when:
                    continue  # XXX: Fixme

                # A red bold span in the body marks a cancelled meeting.
                cancel = \
                    body.xpath(".//span[@style='color:red;font-weight:bold']")

                if len(cancel) > 0:
                    cancel = True
                else:
                    cancel = False

                # NOTE(review): this `descr` is never read (it is rebound in
                # the agenda loop below) and `flush` is never used — likely
                # leftovers.
                descr = body.xpath(".//*")
                flush = False
                where = body.xpath(".//br")[1].tail
                if where is not None:
                    where = where.strip()
                else:
                    where = "unknown"

                kwargs = {"location": where}

                if cancel:
                    kwargs['cancelled'] = cancel

                when = dt.datetime.strptime(when, "%m/%d/%y  %I:%M %p")

                meeting_title = "Scheduled Meeting"  # XXX: Fixme

                agenda = self.scrape_agenda(body.xpath(".//ol"))
                event = Event(session, when, 'committee:meeting',
                              meeting_title, **kwargs)
                event.add_participant("host",
                                      who,
                                      'committee',
                                      chamber=chamber)
                event.add_source(url)

                for item in agenda:
                    bill = item['bill']
                    descr = item['descr']
                    event.add_related_bill(bill,
                                           description=descr,
                                           type="consideration")
                self.save_event(event)
Exemplo n.º 33
0
    def scrape_event_page(self, url, chamber, session):
        """Scrape one MI committee-meeting page into an Event.

        Parses the meeting table's key/value rows, normalizes the many
        time spellings, and attaches agenda bill links as related bills.
        """
        page = self.lxmlize(url)
        trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            # Keep both the text and the element; the Agenda element is
            # re-queried for bill links below.
            metainf[key] = {
                "txt": val.text_content().strip(),
                "obj": val
            }

        if metainf == {}:
            return

        # Wednesday, 5/16/2012 3:00 pm
        datetime = "%s %s" % (
            metainf['Date']['txt'],
            metainf['Time']['txt']
        )
        if "Cancelled" in datetime:
            return

        # Normalize time spellings so strptime's %p directive matches.
        translate = {
            "noon": " PM",
            "a.m.": " AM",
            "am": " AM"  # This is due to a nasty line they had.
        }

        for t in translate:
            if t in datetime:
                datetime = datetime.replace(t, translate[t])

        datetime = re.sub("\s+", " ", datetime)

        # Some times carry a conditional suffix; drop everything from it on.
        flag = "or after committees are given leave"

        if flag in datetime:
            datetime = datetime[:datetime.find(flag)].strip()

        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
        where = metainf['Location']['txt']
        title = metainf['Committee']['txt']  # XXX: Find a better title


        event = Event(session, datetime, 'committee:meeting',
                      title, location=where)
        event.add_source(url)
        event.add_source(mi_events)

        event.add_participant('host', metainf['Committee']['txt'],
                              chamber=chamber)

        agenda = metainf['Agenda']['obj']
        # NOTE(review): "//a" is an absolute xpath — it searches the whole
        # document, not just the agenda cell; confirm that is intended.
        related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            event.add_related_bill(
                bill.text_content(),
                description=agenda.text_content(),
                type='consideration'
            )

        self.save_event(event)
Exemplo n.º 34
0
    def scrape_committee_agendas(self, chamber, session):
        """
        Scrape upper or lower committee agendas.

        Walks the chamber's agenda table, parses each row's meeting
        date/time, fetches the linked agenda, and saves one Event per row.
        """
        # could use &ShowAll=ON doesn't seem to work though
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
                                          self._chamber_short[chamber]
        with self.urlopen(url) as agendas:
            root = html.fromstring(agendas)
            # The two chambers nest the schedule table at different depths.
            if chamber == 'upper':
                event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                         '/td/table/tr/td/table')[0]
            else:
                event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                         '/td/table/tr/td/table/tr/td/table')[0]
            for row in event_table.xpath('tr')[2:]:
                # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
                # HTML Document, PDF Document for house
                # Agenda Date, Committee, Revised, Cancelled, Time, Room,
                # HTML Document, PDF Document for senate
                text = [x.text_content().strip() for x in row.xpath('td')]
                when, committee = text[0:2]
                if chamber == 'upper':
                    time, room = text[4:6]
                    link = row[6].xpath('string(a/@href)')
                else:
                    time, room = text[5:7]
                    link = row[7].xpath('string(a/@href)')
                if 'NOT MEETING' in time or 'CANCELLED' in time:
                    continue
                # Raw string so "\d" isn't treated as an invalid escape.
                time = re.match(r'(\d+:\d+ (A|P))', time)
                if time:
                    # The time cell omits the trailing "M" of AM/PM; re-add it.
                    when = "%s %sM" % (text[0], time.group(0))
                    when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
                else:
                    # No parsable time: fall back to a date-only timestamp.
                    when = text[0]
                    when = datetime.datetime.strptime(when, '%m/%d/%Y')

                when = self._tz.localize(when)

                title = "Committee Meeting:\n%s %s %s\n" % (
                                                  self._chamber_long[chamber],
                                                  committee, room)
                agenda_info = self.parse_agenda(chamber, link)

                description = agenda_info['description']
                member_list = agenda_info['member_list']
                meeting_type = agenda_info['meeting_type']
                agenda_items = agenda_info['agenda_items']
                related_bills = agenda_info['related_bills']
                other = agenda_info['other']

                event = Event(session, when, 'committee:meeting', title,
                              location=room, link=link, details=description) #,
                              #agenda=agenda_items)
                event.add_participant('committee', committee, 'committee',
                                      chamber=chamber)

                # NOTE(review): if 'description' is a plain string,
                # description[i] yields a single character rather than a
                # per-bill description -- confirm parse_agenda's return shape.
                for i in range(0, len(related_bills)):
                    bill = related_bills[i]
                    desc = description[i]
                    event.add_related_bill(
                        bill,
                        description=desc,
                        type="consideration"
                    )

                event['participants'].extend(member_list)
                event.add_source(url)
                event.add_source(link)
                self.save_event(event)
Exemplo n.º 35
0
    def scrape_page(self, url, session, chamber):
        """Scrape a Utah committee agenda page and save it as an Event.

        Pulls committee, date and time from the page's header spans, and
        bill references (e.g. "S.B. 12") from the numbered agenda rows.
        """
        try:
            page = self.lxmlize(url)
        except lxml.etree.XMLSyntaxError:
            self.warning("Ugh. Invalid HTML")
            return  # Ugh, invalid HTML.
        agendas = page.xpath("//td[@class='numberspace']")

        # Header span layout observed: [committee, _, date, time, ...]
        spans = page.xpath("//center/span")
        ctty = None
        date = None
        time = None
        if len(spans) >= 4:
            ctty = spans[0].text_content().strip()
            date = spans[2].text_content().strip()
            time = spans[3].text_content().strip()

        bills = []
        for agenda in agendas:
            number = agenda.text_content()
            string = agenda.getnext().text_content().strip()
            # Raw string: match ids like "S.B. 12", "HR 5", "H.M. 3".
            re_bills = re.findall(r"(S|H)\.?(B|R|M)\. (\d+)", string)
            for bill in re_bills:
                bill_id = '%s%s %s' % bill
                bills.append({'name': bill_id, 'desc': string})

        if ctty is None or date is None or time is None:
            return

        datetime = "%s %s" % (date.strip(), time.strip())
        datetime = re.sub("AGENDA", "", datetime).strip()
        # First line is the date/time; an optional second line is the room.
        datetime = [x.strip() for x in datetime.split("\r\n")]

        if "" in datetime:
            datetime.remove("")

        if len(datetime) == 1:
            datetime.append("state house")

        where = datetime[1]
        translate = {"a.m.": "AM", "p.m.": "PM"}
        for t in translate:
            datetime[0] = datetime[0].replace(t, translate[t])
        datetime = dt.datetime.strptime(datetime[0], "%A, %B %d, %Y %I:%M %p")

        # Classify the chamber from the committee name. BUG FIX: the check
        # was misspelled "seante", so Senate committees always fell through
        # to 'other'; we keep the typo'd token too, defensively.
        chamber = 'other'
        cLow = ctty.lower()
        if "senate" in cLow or "seante" in cLow:
            chamber = 'upper'
        elif "house" in cLow:
            chamber = 'lower'
        elif "joint" in cLow:
            chamber = 'joint'

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      ctty,
                      location=where)
        event.add_source(url)
        event.add_participant('host', ctty, 'committee', chamber=chamber)
        for bill in bills:
            event.add_related_bill(bill['name'],
                                   description=bill['desc'],
                                   type='consideration')
        self.save_event(event)
Exemplo n.º 36
0
    def scrape(self, chamber, session):
        """Scrape Colorado committee hearings for one chamber and save them.

        Walks the committee listing for the requested chamber, then each
        committee's scheduled hearings, extracting title, date/time,
        location, agenda items and related bills.
        """
        url = 'http://leg.colorado.gov/content/committees'

        if chamber == 'lower':
            xpath = '//div/h3[text()="House Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'upper':
            xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'other':
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = '//div/h3[not(contains(text(),"House")) and ' \
                    'not(contains(text(),"Senate"))]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'

        listing = self.lxmlize(url)
        com_links = listing.xpath(xpath)

        # Distinct names per nesting level: the original reused `link` and
        # `page` in both loops, shadowing the outer iteration's values.
        for com_link in com_links:
            com_page = self.lxmlize(com_link)

            hearing_links = com_page.xpath(
                '//div[contains(@class,"schedule-item-content")]/h4/a/@href')

            for hearing_link in hearing_links:
                page = self.lxmlize(hearing_link)

                title = page.xpath('//header/h1[contains(@class,"node-title")]')[0]
                title = title.text_content().strip()

                date_day = page.xpath('//div[contains(@class,"calendar-date")]')[0]
                date_day = date_day.text_content().strip()

                # Details render as "TIME | LOCATION".
                details = page.xpath('//span[contains(@class, "calendar-details")]')[0]
                details = details.text_content().split('|')

                date_time = details[0].strip()
                location = details[1].strip()

                if 'Upon Adjournment' in date_time:
                    # No fixed time available; keep a date-only datetime.
                    date = dt.datetime.strptime(date_day, '%A %B %d, %Y')
                else:
                    date_str = '{} {}'.format(date_day, date_time)
                    date = dt.datetime.strptime(date_str, '%A %B %d, %Y %I:%M %p')

                agendas = []
                # they overload the bills table w/ other agenda items. colspon=2 is agenda
                non_bills = page.xpath('//td[@data-label="Hearing Item" and @colspan="2"]')
                for row in non_bills:
                    content = row.text_content().strip()
                    agendas.append(content)

                agenda = "\n".join(agendas) if agendas else ''

                event = Event(session, date, "committee:meeting", title, location, agenda=agenda)
                event.add_source(hearing_link)

                bills = page.xpath('//td[@data-label="Hearing Item"]/a')
                for bill in bills:
                    bill_id = bill.text_content().strip()

                    event.add_related_bill(
                        bill_id,
                        description="hearing item",
                        type="consideration"
                    )

                self.save_event(event)
Exemplo n.º 37
0
    def scrape_upper(self, session):
        """Scrape the Ohio Senate's weekly committee-schedule PDF.

        Downloads the calendar PDF, converts it to text, then walks the
        alternating header/body structure produced by the regex splits
        below, saving one Event per parsed meeting.
        """
        PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text')
        os.remove(path)

        # Split on day headers like "Wednesday, May 16"; after dropping the
        # preamble, even indices are date headers and odd indices are bodies.
        days = re.split(r'(\w+day, \w+ \d{1,2})', text)
        date = None
        for day in enumerate(days[1:]):
            if day[0] % 2 == 0:
                # Calendar is put out for the current week, so use that year
                date = day[1] + ", " + str(datetime.datetime.now().year)
            else:

                # Within a day, events likewise alternate committee-name /
                # event-body after the split.
                events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1])
                comm = ''
                for event in enumerate(events[1:]):
                    if event[0] % 2 == 0:
                        comm = event[1].strip()
                    else:

                        # A body that doesn't match (search returns None)
                        # raises AttributeError on .groups() -> skip it.
                        try:
                            (time, location, description) = re.search(
                                r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?)\n  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    ''', event[1]).groups()
                        except AttributeError:
                            continue

                        time = datetime.datetime.strptime(
                            time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                        time = self._tz.localize(time)

                        location = location.strip()

                        # Drop page footers and boilerplate lines.
                        # NOTE(review): .decode on a str is Python 2 only;
                        # this breaks on Python 3 -- confirm target runtime.
                        description = '\n'.join([
                            x.strip() for x in description.split('\n')
                            if x.strip() and not x.strip().startswith("Page ")
                            and not x.strip().startswith("*Possible Vote") and
                            not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                        ]).decode('ascii', 'ignore')

                        if not description:
                            description = '[No description provided by state]'

                        event = Event(session=session,
                                      when=time,
                                      type='committee:meeting',
                                      description=description,
                                      location=location)

                        event.add_source(PDF_URL)
                        event.add_participant(type='host',
                                              participant=comm,
                                              participant_type='committee',
                                              chamber='upper')
                        # Description lines like "S. B. 123 <relation>"
                        # become related bills (dots stripped from the id).
                        for line in description.split('\n'):
                            related_bill = re.search(
                                r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$',
                                line)
                            if related_bill:
                                (related_bill,
                                 relation) = related_bill.groups()
                                relation = relation.strip()
                                related_bill = related_bill.replace(".", "")
                                event.add_related_bill(bill_id=related_bill,
                                                       type='consideration',
                                                       description=relation)

                        self.save_event(event)
Exemplo n.º 38
0
    def scrape(self, chamber, session):
        """Scrape Alaska committee hearings (both chambers in one pass).

        Only runs for session '28'; queries the full calendar year and
        emits one Event per hearing, attaching related bills and the
        hosting committee.
        """
        if session != '28':
            raise NoDataForPeriod(session)

        if chamber == 'other':
            return

        year = now.year

        # Full calendar year
        date1 = '0101' + str(year)[2:]
        date2 = '1231' + str(year)[2:]

        url = ("http://www.legis.state.ak.us/basis/"
               "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
               "Comty=&Root=&Sel=1&Button=Display" % (
                   session, date1, date2))

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each hearing is introduced by a "(H) ..." or "(S) ..." font tag.
        path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
        for font in page.xpath(path):
            match = re.match(r'^\((H|S)\)(.+)$', font.text)

            chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
            comm = match.group(2).strip().title()

            next_row = font.xpath("../../following-sibling::tr[1]")[0]

            when = next_row.xpath("string(td[1]/font)").strip()
            # Raw string: avoid the "\s" invalid-escape warning.
            when = re.sub(r"\s+", " ", when)
            when = "%s %s" % (when, year)

            # Skip rows whose date text matches a known "no meeting" slug.
            continu = False
            for slug in exclude_slugs:
                if slug in when:
                    continu = True

            for repl in replacements:
                if repl in when:
                    when = when.replace(repl, replacements[repl])

            if continu:
                continue

            parsed_when = None
            for fmt in formats:
                try:
                    parsed_when = datetime.datetime.strptime(when, fmt)
                    break
                except ValueError:
                    pass

            if not parsed_when:
                # BUG FIX: this was a bare `raise` with no active exception,
                # which itself errors; raise something meaningful instead.
                raise ValueError("Unparsable event date: %r" % when)

            when = parsed_when
            if when < now:
                self.warning("Dropping an event at %s. Be careful!" % (
                    when
                ))
                continue

            when = self._tz.localize(when)

            where = next_row.xpath("string(td[2]/font)").strip()

            description = "Committee Meeting\n"
            description += comm

            # BUG FIX: the original assigned event['link'] here, before
            # `event` was constructed (NameError on the first hearing with
            # an agenda, or mutation of the previously-saved event).
            # Capture the URL now and attach it after construction.
            agenda_url = None
            links = font.xpath(
                "../../td/font/a[contains(@href, 'get_documents')]")
            if links:
                agenda_url = links[0].attrib['href']

            # Collect bill links until the next <hr> separator row.
            cur_node = font.getparent().getparent()
            bills = []
            while cur_node is not None and cur_node.xpath(".//hr") == []:
                bills += cur_node.xpath(
                    ".//a[contains(@href, 'get_complete_bill')]/text()")
                cur_node = cur_node.getnext()

            event = Event(session, when, 'committee:meeting',
                          description, location=where)
            if agenda_url:
                event['link'] = agenda_url

            event.add_source(url)
            for bill in bills:
                event.add_related_bill(bill,
                                       description='Related Bill',
                                       type='consideration')

            event.add_participant('host',
                                  comm,
                                  participant_type='committee',
                                  chamber=chamber)
            self.save_event(event)
Exemplo n.º 39
0
    def scrape_meeting(self, session, url):
        """Parse a single meeting-notice page and record it as an Event.

        Reads the title/date/time/location spans, normalizes the time's
        meridiem spelling, maps the hosting chamber from the title, and
        attaches any bills referenced in the notice's table rows.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)
        # Each of these spans must appear exactly once on the page.
        [title] = doc.xpath("//a[@id='linkTitle']//text()")
        [date] = doc.xpath("//span[@id='lDate']/text()")
        [time] = doc.xpath("//span[@id='lTime']/text()")
        [location] = doc.xpath("//span[@id='lLocation']/text()")

        # Normalize meridiem spellings so strptime's %p can handle them.
        for old, new in (("A.M.", "AM"), ("a.m.", "AM"),
                         ("P.M.", "PM"), ("p.m.", "PM"), ("Noon", "PM")):
            time = time.replace(old, new)

        # Make sure there's a space between the time's minutes and its AM/PM
        if re.search(r'(?i)\d[AP]M$', time):
            time = time[:-2] + " " + time[-2:]

        collapsed = ' '.join(time.split()).upper()
        if re.search("UPON ADJ|TBA", collapsed):
            # No concrete start time: record a date-only, all-day event.
            all_day = True
            when = datetime.datetime.strptime(date, "%B %d, %Y")
        else:
            all_day = False
            when = datetime.datetime.strptime(
                "%s %s" % (date, time), "%B %d, %Y %I:%M %p")

        # when = self._tz.localize(when)

        description = "Meeting on %s of the %s" % (date, title)
        chambers = {"house": "lower",
                    "senate": "upper",
                    "joint": "joint",}

        # First chamber keyword found in the title wins; bail out when the
        # title names no chamber at all.
        lowered = title.lower()
        chamber = next(
            (norm for name, norm in chambers.items() if name in lowered),
            None)
        if chamber is None:
            return

        event = Event(
            session,
            when,
            'committee:meeting',
            description,
            location=location,
            all_day=all_day
        )
        event.add_source(url)

        event.add_participant('host', title, 'committee',
                              chamber=chamber)

        # Skip the header row; bill rows have exactly five cells.
        for tr in doc.xpath("//tr[@valign='top']")[1:]:
            cells = tr.xpath("./td")
            if len(cells) != 5:
                continue
            bill_cell, descr_cell = cells[2], cells[4]

            bill_title = bill_cell.text_content()

            if "S" in bill_title:
                bill_chamber = "upper"
            elif "H" in bill_title:
                bill_chamber = "lower"
            else:
                continue

            event.add_related_bill(bill_id=bill_title,
                                   description=descr_cell.text_content(),
                                   chamber=bill_chamber,
                                   type='consideration')
        self.save_event(event)
Exemplo n.º 40
0
    def scrape(self, chamber, session):
        """Collect Colorado committee hearings for one chamber and save them.

        Resolves the chamber's committee links, visits each committee's
        scheduled hearings, and records title, date/time, location, agenda
        text and related bills per hearing.
        """
        url = 'http://leg.colorado.gov/content/committees'

        if chamber == 'lower':
            xpath = ('//div/h3[text()="House Committees of Reference"]/../'
                     'following-sibling::div[contains(@class,"view-content")]/'
                     'table//td//span[contains(@class,"field-content")]/a/@href')
        elif chamber == 'upper':
            xpath = ('//div/h3[text()="Senate Committees of Reference"]/../'
                     'following-sibling::div[contains(@class,"view-content")]/'
                     'table//td//span[contains(@class,"field-content")]/a/@href')
        elif chamber == 'other':
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = ('//div/h3[not(contains(text(),"House")) and '
                     'not(contains(text(),"Senate"))]/../'
                     'following-sibling::div[contains(@class,"view-content")]/'
                     'table//td//span[contains(@class,"field-content")]/a/@href')

        committee_urls = self.lxmlize(url).xpath(xpath)

        for committee_url in committee_urls:
            committee_page = self.lxmlize(committee_url)

            hearing_urls = committee_page.xpath(
                '//div[contains(@class,"schedule-item-content")]/h4/a/@href')

            for hearing_url in hearing_urls:
                hearing = self.lxmlize(hearing_url)

                title = hearing.xpath(
                    '//header/h1[contains(@class,"node-title")]'
                )[0].text_content().strip()

                date_day = hearing.xpath(
                    '//div[contains(@class,"calendar-date")]'
                )[0].text_content().strip()

                # Details render as "TIME | LOCATION".
                details = hearing.xpath(
                    '//span[contains(@class, "calendar-details")]'
                )[0].text_content().split('|')

                date_time = details[0].strip()
                location = details[1].strip()

                if 'Upon Adjournment' in date_time:
                    # No fixed start time -- keep a date-only datetime.
                    date = dt.datetime.strptime(date_day, '%A %B %d, %Y')
                else:
                    date = dt.datetime.strptime(
                        '{} {}'.format(date_day, date_time),
                        '%A %B %d, %Y %I:%M %p')

                # they overload the bills table w/ other agenda items. colspon=2 is agenda
                agendas = [
                    cell.text_content().strip()
                    for cell in hearing.xpath(
                        '//td[@data-label="Hearing Item" and @colspan="2"]')
                ]
                agenda = "\n".join(agendas) if agendas else ''

                event = Event(session,
                              date,
                              "committee:meeting",
                              title,
                              location,
                              agenda=agenda)
                event.add_source(hearing_url)

                for bill_link in hearing.xpath(
                        '//td[@data-label="Hearing Item"]/a'):
                    event.add_related_bill(bill_link.text_content().strip(),
                                           description="hearing item",
                                           type="consideration")

                self.save_event(event)
Exemplo n.º 41
0
    def scrape(self, chamber, session):
        """Scrape Wyoming committee meeting calendars for one chamber.

        Walks the calendar grid, opens each day's meeting page, groups the
        table rows into per-meeting chunks, and saves an Event per meeting.
        """
        if chamber == "other":
            return

        calendar_url = "http://legisweb.state.wy.us/%s/Calendar/" "CalendarMenu/CommitteeMenu.aspx" % str(session)

        page = self.lxmlize(calendar_url)

        rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')

        for i, row in enumerate(rows):

            # ASP.NET control ids for data rows start at 02 (header offset).
            row_ident = "%02d" % (i + 2)

            date_xpath = './/span[@id="ctl00_cphContent_gv' 'Calendars_ctl%s_lblDate"]' % str(row_ident)
            date_string = row.xpath(date_xpath)[0].text_content()

            # "H" or "S": the chamber letter embedded in the link's id.
            chamber_char = self.metadata["chambers"][chamber]["name"][0].upper()
            meeting_xpath = './/a[@id="ctl00_cphContent_gv' 'Calendars_ctl%s_hl%scallink"]' % (
                str(row_ident),
                chamber_char,
            )
            meeting_url = row.xpath(meeting_xpath)

            if len(meeting_url) == 1 and meeting_url[0].text_content().strip() != "":
                try:
                    meeting_url = meeting_url[0].attrib["href"]
                except KeyError:
                    self.warning("Alleged meeting date has no URL: " + meeting_url[0].text_content().strip())
                    continue

                meeting_page = self.lxmlize(meeting_url)
                meetings = meeting_page.xpath('.//table[@class="MsoNormalTable"]/tr')
                meeting_idents = []
                meeting_ident = 0

                # breaking the meetings into arrays (meeting_data) for
                # processing. meeting_ident is the first row of the meeting
                # (time, committee, location)
                for meeting in meetings:
                    if self.is_row_a_new_meeting(meeting):
                        meeting_idents.append(meeting_ident)
                    meeting_ident += 1

                for i, meeting_ident in enumerate(meeting_idents):

                    # Last meeting runs to the end of the table; otherwise
                    # slice up to the row before the next meeting's start.
                    if len(meeting_idents) == 1 or i + 1 == len(meeting_idents):
                        ident_start, ident_end = [meeting_ident, 0]
                        meeting_data = meetings[ident_start:]
                    else:
                        ident_start, ident_end = [meeting_ident, meeting_idents[i + 1] - 1]

                        # Guarantee at least two rows per meeting chunk.
                        if ident_end - ident_start == 1:
                            ident_end = ident_start + 2

                        meeting_data = meetings[ident_start:ident_end]
                    committee = self.get_committee(meeting_data)
                    meeting_time = self.get_meeting_time(meeting_data)
                    meeting_date_time = datetime.datetime.strptime(
                        date_string + " " + meeting_time, "%m/%d/%Y %I:%M %p"
                    )
                    meeting_date_time = self._tz.localize(meeting_date_time)

                    location = self.get_location(meeting_data)
                    description = self.get_meeting_description(meeting_data)
                    bills = self.get_bills(meeting_data)

                    # Fall back to the committee name when no description.
                    if description == "":
                        description = committee

                    event = Event(session, meeting_date_time, "committee:meeting", description, location)

                    event.add_source(meeting_url)

                    for bill in bills:

                        if bill["bill_description"] == "":
                            bill["bill_description"] = committee

                        event.add_related_bill(
                            bill_id=bill["bill_id"], description=bill["bill_description"], type="consideration"
                        )
                        event.add_document(
                            name=bill["bill_id"], url=bill["bill_url"], type="bill", mimetype="application/pdf"
                        )

                    event.add_participant(
                        type="host", participant=committee, participant_type="committee", chamber=chamber
                    )

                    self.save_event(event)
Exemplo n.º 42
0
    def scrape(self, chamber, session):
        """Scrape South Carolina weekly meeting schedules for one chamber.

        Parses the meetings page's per-day lists, resolves each meeting's
        year (next-year dates carry an explicit year in the string), and
        saves an Event, attaching bills found on any linked agenda.
        """
        if chamber == 'other':
            return

        events_url = 'http://www.scstatehouse.gov/meetings.php?chamber=%s' % (
            self.metadata['chambers'][chamber]['name'].upper()[0])
        page = self.get_page_from_url(events_url)

        # Header reads "Week of <Month> <day>, <year>"; grab the year for
        # events whose own date string omits it.
        meeting_year = page.xpath(
            '//h2[@class="barheader"]/span')[0].text_content()
        meeting_year = re.search(
            r'Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})',
            meeting_year).group(1)

        dates = page.xpath("//div[@id='contentsection']/ul")

        for date in dates:
            date_string = date.xpath('span')

            if len(date_string) == 1:
                date_string = date_string[0].text_content()
            else:
                continue

            # If a event is in the next calendar year, the date_string
            # will have a year in it
            if date_string.count(",") == 2:
                event_year = date_string[-4:]
                date_string = date_string[:-6]
            elif date_string.count(",") == 1:
                event_year = meeting_year
            else:
                # BUG FIX: .format() was chained onto the AssertionError
                # instance (an AttributeError at runtime), not the message.
                raise AssertionError(
                    "This is not a valid date: '{}'".format(date_string))

            for meeting in date.xpath('li'):
                time_string = meeting.xpath('span')[0].text_content()

                if time_string == 'CANCELED' or len(
                        meeting.xpath(
                            './/span[contains(text(), "CANCELED")]')) > 0:
                    continue

                time_string = self.normalize_time(time_string)
                date_time = datetime.datetime.strptime(
                    event_year + ' ' + date_string + ' ' + time_string,
                    "%Y %A, %B %d %I:%M %p")

                date_time = self._tz.localize(date_time)
                # Meeting text looks like "-- LOCATION -- DESCRIPTION".
                meeting_info = meeting.xpath(
                    'br[1]/preceding-sibling::node()')[1]
                location, description = re.search(r'-- (.*?) -- (.*)',
                                                  meeting_info).groups()

                if re.search(r'committee', description, re.I):
                    meeting_type = 'committee:meeting'
                else:
                    meeting_type = 'other:meeting'

                event = Event(session, date_time, meeting_type, description,
                              location)
                event.add_source(events_url)

                agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")

                if agenda_url:
                    agenda_url = agenda_url[0].attrib['href']
                    event.add_source(agenda_url)
                    agenda_page = self.get_page_from_url(agenda_url)

                    for bill in agenda_page.xpath(
                            ".//a[contains(@href,'billsearch.php')]"):
                        bill_url = bill.attrib['href']
                        bill_id = bill.text_content().replace('.', '').replace(
                            ' ', '')
                        bill_description = self.get_bill_description(bill_url)

                        event.add_related_bill(bill_id=bill_id,
                                               type='consideration',
                                               description=bill_description)
                self.save_event(event)
Exemplo n.º 43
0
    def scrape(self, chamber, session):
        """Scrape Pennsylvania committee meeting schedules for one chamber.

        Walks each date header cell, then its following rows until the next
        date header, parsing time, description, location and linked bills.
        """
        if chamber == 'upper':
            url = "http://www.legis.state.pa.us/WU01/LI/CO/SM/COSM.HTM"
        elif chamber == 'lower':
            url = "http://www.legis.state.pa.us/WU01/LI/CO/HM/COHM.HTM"
        else:
            return

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for date_td in page.xpath("//td[@valign='middle']"):
            date = date_td.text_content().strip()

            # Validation only: raises ValueError if the cell is not a date.
            datetime.datetime.strptime(
                date, "%A, %B %d, %Y").date()

            next_tr = date_td.getparent().getnext()
            while next_tr is not None:
                # A new date header ends this date's run of meeting rows.
                if next_tr.xpath("td[@valign='middle']"):
                    break

                time = next_tr.xpath("string(td[1])").strip()
                # Renamed from `dt` to avoid shadowing a datetime alias.
                when = "%s %s" % (date, time)

                try:
                    when = datetime.datetime.strptime(
                        when, "%A, %B %d, %Y %I:%M %p")
                    when = self._tz.localize(when)
                except ValueError:
                    break

                desc = next_tr.xpath("string(td[2])").strip()
                desc_el = next_tr.xpath("td[2]")[0]
                desc = re.sub(r'\s+', ' ', desc)

                # Committee name is whatever precedes "COMMITTEE" in the text.
                ctty = None
                cttyraw = desc.split("COMMITTEE", 1)
                if len(cttyraw) > 1:
                    ctty = cttyraw[0]

                related_bills = desc_el.xpath(
                    ".//a[contains(@href, 'billinfo')]")
                bills = []
                urls = [x.attrib['href'] for x in related_bills]

                # Reconstruct bill ids from each link's query string,
                # e.g. body=S, bn=123 -> "SB 123".
                for u in urls:
                    o = urlparse.urlparse(u)
                    qs = urlparse.parse_qs(o.query)
                    bills.append({
                        "bill_id": "%sB %s" % (qs['body'][0], qs['bn'][0]),
                        "bill_num": qs['bn'][0],
                        "bill_chamber": qs['body'][0],
                        "session": qs['syear'][0],
                        "descr": desc
                    })

                location = next_tr.xpath("string(td[3])").strip()
                location = re.sub(r'\s+', ' ', location)

                event = Event(session, when, 'committee:meeting',
                              desc, location)
                event.add_source(url)

                # Idiomatic form of the original `not ctty is None`.
                if ctty is not None:
                    event.add_participant('host', ctty, 'committee',
                                          chamber=chamber)

                for bill in bills:
                    event.add_related_bill(
                        bill['bill_id'],
                        description=bill['descr'],
                        type='consideration'
                    )
                self.save_event(event)
                next_tr = next_tr.getnext()
Exemplo n.º 44
0
    def scrape(self, chamber, session):
        """Scrape Maryland committee hearing notices into Event objects.

        The schedule page renders each notice as plain text inside a
        <pre> tag, so parsing is positional: fixed line offsets for the
        date, time and address block. Runs once, on the 'other' pass.
        """
        if chamber != 'other':
            return None  # We're going to do it all on one shot.

        if session[-2:] == "s1":
            return None  # Special sessions 404

        url = "http://mlis.state.md.us/%s/hearsch/alladd.htm" % (session)
        page = self.lxmlize(url)
        events = page.xpath("//pre")
        for event in events:
            # Committee name is the text before the first "-" in the
            # element immediately preceding this <pre> block.
            ctty_name = [
                x.strip()
                for x in event.getparent().getprevious().text_content().split(
                    "-", 1)
            ]
            ctty_name = ctty_name[0]
            event_text = event.text_content()
            if "This meeting has been cancelled." in event_text:
                continue
            # OK. In order to process this text-only notice, we have to resort
            # to some major hackage. Just roll with it.
            lines = event_text.split("\n")
            # In order to get the key stuff, we need to figure out where the
            # address "block" starts.
            address_block = last_space(lines[4])
            assert address_block is not None
            # OK. Given the offset, we can "split" the time off the date block.
            time_room = lines[3]
            time = time_room[:address_block].strip()

            if "TBD" in time:
                continue  # Nothing's set yet.
            time = "%s %s" % (lines[1], time)
            time = re.sub("\s+", " ", time).strip()
            # Normalize meridiem spellings so strptime's %p can match.
            trans = {"P.M.": "PM", "A.M.": "AM"}
            for transition in trans:
                time = time.replace(transition, trans[transition])

            when = dt.datetime.strptime(time, "%A %B %d, %Y %I:%M %p")

            room = time_room[address_block:].strip()
            place_block = lines[4:]
            where = room + "\n"
            done = False
            offset = 4
            # Accumulate address lines until the first blank line.
            for place in place_block:
                if place.strip() == "":
                    done = True
                if done:
                    continue
                offset += 1
                where += place.strip() + "\n"
            where = where.strip()
            # Now that the date's processed, we can move on.
            moreinfo = lines[offset + 1:]
            # Fold "Key: value" lines into a dict; continuation lines
            # (no colon) are appended to the previous key's value.
            info = {}
            key = "unattached_header"
            for inf in moreinfo:
                if ":" in inf:
                    key, value = inf.split(":", 1)
                    key = key.strip()
                    info[key] = value.strip()
                else:
                    info[key] += " " + inf.strip()
            # Alright. We should have enough now.
            subject = info['Subject']

            event = Event(session,
                          when,
                          'committee:meeting',
                          subject,
                          location=where)
            event.add_source(url)

            flags = {"joint": "joint", "house": "lower", "senate": "upper"}
            chamber = "other"
            for flag in flags:
                if flag in ctty_name.lower():
                    chamber = flags[flag]

            # Let's try and hack out some bill names by compressing the
            # words in the subject into bill-ID prefixes.
            trans = {
                "SENATE": "S",
                "HOUSE": "H",
                "JOINT": "J",
                "BILL": "B",
                "RESOLUTION": "R",
            }
            _t_subject = subject.upper()
            for t in trans:
                regex = "%s(\s+)?" % t
                _t_subject = re.sub(regex, trans[t], _t_subject)
            # (A leftover Python 2 debug `print _t_subject` was removed here.)
            bills = re.findall("(S|H)(J)?(B|R|M)\s*(\d{4})", _t_subject)
            for bill in bills:
                name = bill[:3]
                bid = bill[3]
                bill_id = "%s %s" % (''.join(name), bid)
                event.add_related_bill(bill_id,
                                       description=subject,
                                       type='consideration')

            event.add_participant("host", ctty_name, chamber=chamber)

            self.save_event(event)
Exemplo n.º 45
0
    def scrape_meeting(self, session, chamber, url):
        """Scrape a single committee-meeting detail page into an Event.

        Skips meetings whose time is "upon adjournment" (no concrete
        start time) and meetings whose title doesn't identify a chamber.
        """
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        # Trailing-comma unpack: each xpath must match exactly one node,
        # otherwise a ValueError is raised (page layout changed).
        title, = page.xpath("//a[@id='linkTitle']//text()")
        date, = page.xpath("//span[@id='lDate']/text()")
        time, = page.xpath("//span[@id='lTime']/text()")
        location, = page.xpath("//span[@id='lLocation']/text()")

        # Both single- and double-space variants appear in the wild.
        if ("UPON ADJOURNMENT" in time.upper()
                or "UPON  ADJOURNMENT" in time.upper()):
            return

        # Normalize meridiem spellings so strptime's %p can match.
        substs = {
            "AM": ["A.M.", "a.m."],
            "PM": ["P.M.", "p.m."],
        }

        for key, values in substs.items():
            for value in values:
                time = time.replace(value, key)

        # Some listings omit the AM/PM marker; fall back to parsing
        # without %p in that case.
        try:
            when = datetime.datetime.strptime("%s %s" % (date, time),
                                              "%B %d, %Y %I:%M %p")
        except ValueError:
            when = datetime.datetime.strptime("%s %s" % (date, time),
                                              "%B %d, %Y %I:%M")

        # when = self._tz.localize(when)

        description = "Meeting on %s of the %s" % (date, title)
        chambers = {
            "house": "lower",
            "senate": "upper",
            "joint": "joint",
        }

        # Attribute the meeting to a chamber from the committee title;
        # the for/else bails out entirely when no keyword matches.
        for chamber_, normalized in chambers.items():
            if chamber_ in title.lower():
                chamber = normalized
                break
        else:
            return

        event = Event(session,
                      when,
                      'committee:meeting',
                      description,
                      location=location)
        event.add_source(url)

        event.add_participant('host', title, 'committee', chamber=chamber)

        # Skip the header row, then pull related bills from the agenda
        # table: each data row has exactly five cells.
        trs = iter(page.xpath("//tr[@valign='top']"))
        next(trs)

        for tr in trs:
            try:
                _, _, bill, whom, descr = tr.xpath("./td")
            except ValueError:
                continue

            bill_title = bill.text_content()

            # Crude chamber detection from the bill identifier text.
            if "S" in bill_title:
                bill_chamber = "upper"
            elif "H" in bill_title:
                bill_chamber = "lower"
            else:
                continue

            event.add_related_bill(bill_id=bill_title,
                                   description=descr.text_content(),
                                   chamber=bill_chamber,
                                   type='consideration')
        self.save_event(event)
Exemplo n.º 46
0
    def scrape(self, chamber, session):
        """Scrape Pennsylvania committee meeting schedules for one chamber.

        The schedule page is a sequence of date rows (td[@valign='middle'])
        each followed by one <tr> per meeting, until the next date row.
        """
        if chamber == 'upper':
            url = "http://www.legis.state.pa.us/WU01/LI/CO/SM/COSM.HTM"
        elif chamber == 'lower':
            url = "http://www.legis.state.pa.us/WU01/LI/CO/HM/COHM.HTM"
        else:
            return

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for date_td in page.xpath("//td[@valign='middle']"):
            date = date_td.text_content().strip()

            # Parse-and-discard: sanity check that this cell really holds
            # a date (raises ValueError and aborts the scrape otherwise).
            datetime.datetime.strptime(date, "%A, %B %d, %Y").date()

            next_tr = date_td.getparent().getnext()
            while next_tr is not None:
                # A new date row ends this date's run of meetings.
                if next_tr.xpath("td[@valign='middle']"):
                    break

                time = next_tr.xpath("string(td[1])").strip()
                # Renamed from `dt` to avoid confusion with the common
                # `datetime as dt` module alias.
                when = "%s %s" % (date, time)

                try:
                    when = datetime.datetime.strptime(
                        when, "%A, %B %d, %Y %I:%M %p")
                    when = self._tz.localize(when)
                except ValueError:
                    break

                desc = next_tr.xpath("string(td[2])").strip()
                desc_el = next_tr.xpath("td[2]")[0]
                desc = re.sub(r'\s+', ' ', desc)

                # Committee name is the text preceding "COMMITTEE", when
                # the description contains that word.
                ctty = None
                cttyraw = desc.split("COMMITTEE", 1)
                if len(cttyraw) > 1:
                    ctty = cttyraw[0]

                # Related bills are links to billinfo pages; pull the
                # identifying fields out of each link's query string.
                related_bills = desc_el.xpath(
                    ".//a[contains(@href, 'billinfo')]")
                bills = []
                urls = [x.attrib['href'] for x in related_bills]

                for u in urls:
                    o = urlparse.urlparse(u)
                    qs = urlparse.parse_qs(o.query)
                    bills.append({
                        "bill_id": "%sB %s" % (qs['body'][0], qs['bn'][0]),
                        "bill_num": qs['bn'][0],
                        "bill_chamber": qs['body'][0],
                        "session": qs['syear'][0],
                        "descr": desc,
                    })

                location = next_tr.xpath("string(td[3])").strip()
                location = re.sub(r'\s+', ' ', location)

                event = Event(session, when, 'committee:meeting', desc,
                              location)
                event.add_source(url)

                # Idiomatic `is not None` (was `not ctty is None`).
                if ctty is not None:
                    event.add_participant('host',
                                          ctty,
                                          'committee',
                                          chamber=chamber)

                for bill in bills:
                    event.add_related_bill(bill['bill_id'],
                                           description=bill['descr'],
                                           type='consideration')
                self.save_event(event)
                next_tr = next_tr.getnext()
Exemplo n.º 47
0
    def scrape_agenda(self, url, session):
        """Scrape a committee agenda page: meeting notice plus its bills."""
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        # Collect the DATE:/TIME:/PLACE: label -> value rows.
        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
        # Candidate strptime formats, tried in order below.
        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        # `all_day` is a module-level collection of time strings that mean
        # "no specific start time" (not visible in this chunk).
        if time in all_day:
            datetime = date
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime:
            # XXX: Do something more advanced.
            return

        # Normalize meridiem spellings / strip status words so strptime
        # can match.
        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        # NOTE(review): "P.M" and "P.M." overlap, so the result depends on
        # dict iteration order; the "PM." entry appears to mop up the
        # "P.M" -> "PM." case, but confirm this is stable on Python 2.
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        # Try each format until one parses.
        # NOTE(review): if none of `fmts` matches, `datetime` stays a
        # string and is passed to Event() as-is — verify downstream copes.
        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      'Meeting Notice',
                      location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(),
                               bill_ft,
                               type="full-text",
                               mimetype="application/pdf")
            # The bill ID is the concatenated text of the link's siblings.
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().getparent().getnext().getnext(
            ).text_content()

            # `replace` is a module-level cleanup table (not visible here).
            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        # Chamber is inferred from the session label text.
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {"house": "lower", "joint": "joint", "senate": "upper"}
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, 'committee', chamber=chamber)

        self.save_event(event)
Exemplo n.º 48
0
    def scrape_meeting(self, session, chamber, url):
        """Scrape one committee-meeting detail page into a single Event.

        Skips "upon adjournment" meetings and meetings whose title does
        not identify a chamber.
        """
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        # Trailing-comma unpack: each xpath must match exactly one node.
        title, = page.xpath("//a[@id='linkTitle']//text()")
        date, = page.xpath("//span[@id='lDate']/text()")
        time, = page.xpath("//span[@id='lTime']/text()")
        location, = page.xpath("//span[@id='lLocation']/text()")

        if time == "UPON ADJOURNMENT":
            return

        if "A.M." in time:
            time = time.replace("A.M.", "AM")

        # Some listings omit the AM/PM marker; fall back to parsing
        # without %p in that case.
        try:
            when = datetime.datetime.strptime("%s %s" % (
                date, time
            ), "%B %d, %Y %I:%M %p")
        except ValueError:
            when = datetime.datetime.strptime("%s %s" % (
                date, time
            ), "%B %d, %Y %I:%M")

        description = "Meeting on %s of the %s" % (date, title)
        chambers = {
            "house": "lower",
            "senate": "upper",
            "joint": "joint",
        }

        # for/else: bail out entirely if no chamber keyword matches.
        for chamber_, normalized in chambers.items():
            if chamber_ in title.lower():
                chamber = normalized
                break
        else:
            return

        event = Event(
            session,
            when,
            'committee:meeting',
            description,
            location=location
        )
        event.add_source(url)

        event.add_participant('host', title, 'committee',
                              chamber=chamber)

        # Skip the header row, then collect related bills.
        trs = iter(page.xpath("//tr[@valign='top']"))
        next(trs)

        for tr in trs:
            try:
                _, _, bill, whom, descr = tr.xpath("./td")
            except ValueError:
                continue

            bill_title = bill.text_content()

            if "S" in bill_title:
                bill_chamber = "upper"
            elif "H" in bill_title:
                bill_chamber = "lower"
            else:
                continue

            event.add_related_bill(bill_id=bill_title,
                                   description=descr.text_content(),
                                   chamber=bill_chamber,
                                   type='consideration')
        # Save exactly once per meeting. This was previously inside the
        # loop, which re-saved the event for every related bill and never
        # saved it at all when no bill rows parsed — the sibling
        # scrape_meeting implementations save after the loop.
        self.save_event(event)
Exemplo n.º 49
0
    def scrape(self, chamber, session):
        """Scrape Alaska committee hearings for the full calendar year."""
        if session != '28':
            raise NoDataForPeriod(session)

        if chamber == 'other':
            return

        year = now.year

        # Full calendar year
        date1 = '0101' + str(year)[2:]
        date2 = '1231' + str(year)[2:]

        url = ("http://www.legis.state.ak.us/basis/"
               "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
               "Comty=&Root=&Sel=1&Button=Display" % (
                   session, date1, date2))

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Each hearing header is a <font> whose text starts "(H)"/"(S)".
        path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
        for font in page.xpath(path):
            match = re.match(r'^\((H|S)\)(.+)$', font.text)

            chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
            comm = match.group(2).strip().title()

            next_row = font.xpath("../../following-sibling::tr[1]")[0]

            when = next_row.xpath("string(td[1]/font)").strip()
            when = re.sub("\s+", " ", when)
            when = "%s %s" % (when, year)

            # Skip rows whose "time" contains a known non-time slug
            # (module-level `exclude_slugs`, not visible in this chunk).
            continu = False
            for slug in exclude_slugs:
                if slug in when:
                    continu = True

            for repl in replacements:
                if repl in when:
                    when = when.replace(repl, replacements[repl])

            if continu:
                continue

            # Try each known time format until one parses.
            parsed_when = None
            for fmt in formats:
                try:
                    parsed_when = datetime.datetime.strptime(when, fmt)
                    break
                except ValueError:
                    pass

            if not parsed_when:
                # Was a bare `raise` with no active exception, which is
                # invalid outside an except block; fail loudly with an
                # explicit, descriptive error instead.
                raise ValueError("Couldn't parse hearing time: %r" % when)

            when = parsed_when
            if when < now:
                self.warning("Dropping an event at %s. Be careful!" % (
                    when
                ))
                continue

            when = self._tz.localize(when)

            where = next_row.xpath("string(td[2]/font)").strip()

            description = "Committee Meeting\n"
            description += comm

            # Collect bill IDs from this hearing's rows, up to the <hr>
            # separating it from the next hearing.
            cur_node = font.getparent().getparent()
            bills = []
            while cur_node is not None and cur_node.xpath(".//hr") == []:
                bills += cur_node.xpath(
                    ".//a[contains(@href, 'get_complete_bill')]/text()")
                cur_node = cur_node.getnext()

            event = Event(session, when, 'committee:meeting',
                          description, location=where)

            # Attach the agenda link, if any. This previously ran before
            # `event` was created, so with a link present it raised a
            # NameError on the first iteration and silently decorated the
            # PREVIOUS (already-saved) event on later ones.
            links = font.xpath(
                "../../td/font/a[contains(@href, 'get_documents')]")
            if links:
                agenda_link = links[0]
                event['link'] = agenda_link.attrib['href']

            event.add_source(url)
            for bill in bills:
                event.add_related_bill(bill,
                                       description='Related Bill',
                                       type='consideration')

            event.add_participant('host',
                                  comm,
                                  participant_type='committee',
                                  chamber=chamber)
            self.save_event(event)
Exemplo n.º 50
0
    def scrape_meeting(self, session, url):
        """Build and save an Event for one committee-meeting detail page."""
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        # Each of these nodes appears exactly once per detail page;
        # the trailing-comma unpack enforces that.
        title, = doc.xpath("//a[@id='linkTitle']//text()")
        date, = doc.xpath("//span[@id='lDate']/text()")
        time_text, = doc.xpath("//span[@id='lTime']/text()")
        location, = doc.xpath("//span[@id='lLocation']/text()")

        # Normalize meridiem spellings so strptime's %p can match.
        for canonical, variants in {
            "AM": ["A.M.", "a.m."],
            "PM": ["P.M.", "p.m.", "Noon"],
        }.items():
            for variant in variants:
                time_text = time_text.replace(variant, canonical)

        # Make sure there's a space between the time's minutes and its AM/PM
        if re.search(r'(?i)\d[AP]M$', time_text):
            time_text = time_text[:-2] + " " + time_text[-2:]

        all_day = "UPON ADJ" in ' '.join(time_text.split()).upper()
        if all_day:
            # No concrete start time; record as an all-day event.
            when = datetime.datetime.strptime(date, "%B %d, %Y")
        else:
            when = datetime.datetime.strptime("%s %s" % (date, time_text),
                                              "%B %d, %Y %I:%M %p")

        # when = self._tz.localize(when)

        description = "Meeting on %s of the %s" % (date, title)

        # Attribute the meeting to a chamber from the committee title;
        # bail out entirely if no keyword matches.
        matched = [
            normalized
            for keyword, normalized in {
                "house": "lower",
                "senate": "upper",
                "joint": "joint",
            }.items()
            if keyword in title.lower()
        ]
        if not matched:
            return
        chamber = matched[0]

        event = Event(session,
                      when,
                      'committee:meeting',
                      description,
                      location=location,
                      all_day=all_day)
        event.add_source(url)

        event.add_participant('host', title, 'committee', chamber=chamber)

        rows = iter(doc.xpath("//tr[@valign='top']"))
        next(rows)  # skip the header row

        for row in rows:
            cells = row.xpath("./td")
            # Data rows have exactly five cells; skip anything else.
            if len(cells) != 5:
                continue
            bill_cell, descr_cell = cells[2], cells[4]

            bill_title = bill_cell.text_content()

            if "S" in bill_title:
                bill_chamber = "upper"
            elif "H" in bill_title:
                bill_chamber = "lower"
            else:
                continue

            event.add_related_bill(bill_id=bill_title,
                                   description=descr_cell.text_content(),
                                   chamber=bill_chamber,
                                   type='consideration')
        self.save_event(event)
Exemplo n.º 51
0
    def scrape_page(self, chamber, session):
        """Scrape the weekly committee-schedule grid for one chamber.

        The grid has a header row of dates and one row per committee;
        each cell holds either "No Meeting" or a time/room/bills block.
        """
        url = pages[chamber]
        page = self.lxmlize(url)

        rows = page.xpath("//table[@class='MsoNormalTable']/tr")
        header = rows[0]
        rows = rows[1:]

        # Map column index -> date string from the header row.
        dates = {}
        dIdex = 0
        for row in header.xpath(".//td")[1:]:
            date = row.text_content()
            date = re.sub("\s+", " ", date).strip()
            dates[dIdex] = date
            dIdex += 1

        def _parse_time_block(block):
            # Parse one schedule cell into (hours, room, bills).
            if block.strip() == "No Meeting":
                return None, None, []
            room = None

            blocks = [x.strip() for x in block.split("\n")]

            # Line 1 holds the time (parenthesized notes stripped),
            # line 2 the bill references.
            hour = re.sub("\(.*\)", "", blocks[1])
            bills = blocks[2]

            # "after adjournment"-style times can't be scheduled; skip.
            if "after" in hour or "after" in bills:
                return None, None, []

            # Extra time cleanup
            # "Rm" splits the room number off the time.
            if "Rm" in hour:
                inf = hour.split("Rm")
                assert len(inf) == 2
                room = inf[1]
                hour = inf[0]

            # "and" joins multiple meeting times in the same cell.
            hour = [x.strip() for x in hour.split('and')]

            # We'll pass over this twice: first a single bill reference,
            # then an "HBs 123; 456" style multi-bill reference.
            single_bill = re.search("(H|S)(B|R) \d{3}", bills)
            if single_bill is not None:
                start, end = single_bill.regs[0]
                description = bills
                bills = bills[start:end]
                bills = [{"bill_id": bills, "description": description}]
            else:
                multi_bills = re.search("(H|S)(B|R|M)s (\d{3}(; )?)+", bills)
                if multi_bills is not None:
                    # parse away.
                    bill_array = bills.split()
                    # Renamed from `type` to avoid shadowing the builtin.
                    bill_type = bill_array[0]
                    bill_array = bill_array[1:]
                    bill_array = [
                        x.replace(";", "").strip() for x in bill_array
                    ]
                    bill_type = bill_type.replace("s", "")
                    bill_array = [{
                        "bill_id": "%s %s" % (bill_type, x),
                        "description": bills
                    } for x in bill_array]
                    bills = bill_array
                # NOTE(review): if neither regex matches, `bills` is
                # returned as a raw string and the caller's `bills == []`
                # guard won't catch it — confirm upstream data never hits
                # this path.

            return hour, room, bills

        for row in rows:
            tds = row.xpath(".//td")
            ctty = re.sub("\s+", " ", tds[0].text_content().strip())
            times = tds[1:]
            for i in range(0, len(times)):
                hours, room, bills = _parse_time_block(times[i].text_content())
                if hours is None or bills == []:
                    continue

                # One event per listed time in the cell.
                for hour in hours:
                    # Renamed from `datetime` to avoid confusion with the
                    # datetime module.
                    when = "%s %s" % (dates[i], hour)
                    when = when.encode("ascii", "ignore")

                    # DAY_OF_WEEK MONTH/DAY/YY %I:%M %p"
                    when = dt.datetime.strptime(when,
                                                "%A %m/%d/%y %I:%M %p")
                    event = Event(session, when, 'committee:meeting',
                                  'Meeting Notice', "Room %s" % (room))
                    event.add_source(url)
                    for bill in bills:
                        event.add_related_bill(bill['bill_id'],
                                               description=bill['description'],
                                               type='consideration')
                    event.add_participant("host", ctty, chamber=chamber)
                    self.save_event(event)
Exemplo n.º 52
0
    def scrape_event_page(self, url, chamber, session):
        """Scrape one Michigan committee-meeting page into an Event."""
        page = self.lxmlize(url)
        trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
        # Collect label -> value rows ("Date", "Time", "Location", ...),
        # keeping both the cell text and the lxml element for later use.
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            metainf[key] = {"txt": val.text_content().strip(), "obj": val}

        if metainf == {}:
            return

        # Wednesday, 5/16/2012 3:00 pm
        datetime = "%s %s" % (metainf['Date']['txt'], metainf['Time']['txt'])
        if "Cancelled" in datetime:
            return

        # Normalize the many meridiem spellings seen in the wild.
        # NOTE(review): "a.m." and "a.m" overlap, so the outcome depends
        # on dict iteration order on Python 2 — confirm this is stable.
        translate = {
            "noon": " PM",
            "a.m.": " AM",
            "am": " AM",  # This is due to a nasty line they had.
            "a.m": "AM"  #another weird one
        }

        for t in translate:
            if t in datetime:
                datetime = datetime.replace(t, translate[t])

        datetime = re.sub("\s+", " ", datetime)

        # Strip trailing free-text qualifiers before parsing.
        for text_to_remove in [
                "or after committees are given leave",
                "or later immediately after committees are given leave",
                "or later after committees are given leave by the House to meet",
                "**Please note time**"
        ]:
            datetime = datetime.split(text_to_remove)[0].strip()

        datetime = datetime.replace('p.m.', 'pm')
        datetime = datetime.replace('Noon', "pm")
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
        where = metainf['Location']['txt']
        title = metainf['Committee']['txt']  # XXX: Find a better title

        if chamber == 'other':
            chamber = 'joint'

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      title,
                      location=where)
        event.add_source(url)
        event.add_source(mi_events)

        event.add_participant('chair',
                              metainf['Chair']['txt'],
                              'legislator',
                              chamber=chamber)

        event.add_participant('host',
                              metainf['Committee']['txt'],
                              'committee',
                              chamber=chamber)

        # Match each linked bill to the agenda line that mentions it.
        # NOTE(review): "//a[...]" searches the whole document, not just
        # the agenda cell — ".//a" may have been intended; confirm.
        agenda = metainf['Agenda']['obj']
        agendas = agenda.text_content().split("\r")

        related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            # NOTE(review): this fallback assigns the lxml element itself,
            # not its text — looks like it should be the agenda text;
            # verify what Event expects for `description`.
            description = agenda
            for a in agendas:
                if bill.text_content() in a:
                    description = a

            event.add_related_bill(bill.text_content(),
                                   description=description,
                                   type='consideration')

        self.save_event(event)
Exemplo n.º 53
0
    def scrape(self, chamber, session):
        """Scrape Wyoming committee meeting calendars for one chamber.

        Walks the calendar index, follows each dated meeting link, and
        splits the meeting table into per-meeting row groups before
        delegating field extraction to helper methods.
        """
        if chamber == 'other':
            return

        calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                        "CalendarMenu/CommitteeMenu.aspx" % str(session))

        page = self.lxmlize(calendar_url)

        rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')

        for i, row in enumerate(rows):

            # ASP.NET control IDs are numbered from "02" for the first row.
            row_ident = '%02d' % (i + 2)

            date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                          'Calendars_ctl%s_lblDate"]' % str(row_ident))
            date_string = row.xpath(date_xpath)[0].text_content()

            # First letter of the chamber name ("H"/"S") selects which
            # per-chamber calendar link to follow.
            chamber_char = self.metadata['chambers'][chamber]['name'][0].upper(
            )
            meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                             'Calendars_ctl%s_hl%scallink"]' %
                             (str(row_ident), chamber_char))
            meeting_url = row.xpath(meeting_xpath)

            if (len(meeting_url) == 1
                    and meeting_url[0].text_content().strip() != ''):
                try:
                    meeting_url = meeting_url[0].attrib['href']
                except KeyError:
                    self.warning("Alleged meeting date has no URL: " +
                                 meeting_url[0].text_content().strip())
                    continue

                meeting_page = self.lxmlize(meeting_url)
                meetings = meeting_page.xpath(
                    './/table[@class="MsoNormalTable"]/tr')
                meeting_idents = []
                meeting_ident = 0

                # breaking the meetings into arrays (meeting_data) for
                # processing. meeting_ident is the first row of the meeting
                # (time, committee, location)
                for meeting in meetings:
                    if self.is_row_a_new_meeting(meeting):
                        meeting_idents.append(meeting_ident)
                    meeting_ident += 1

                # NOTE(review): this inner `i` shadows the outer row index;
                # harmless since the outer `i` isn't used again below, but
                # worth renaming.
                for i, meeting_ident in enumerate(meeting_idents):

                    # The last meeting runs to the end of the table;
                    # otherwise slice up to the row before the next meeting.
                    if len(meeting_idents) == 1 or i + 1 == len(
                            meeting_idents):
                        ident_start, ident_end = [meeting_ident, 0]
                        meeting_data = meetings[ident_start:]
                    else:
                        ident_start, ident_end = [
                            meeting_ident, meeting_idents[i + 1] - 1
                        ]

                        # Guarantee at least two rows per meeting slice.
                        if ident_end - ident_start == 1:
                            ident_end = ident_start + 2

                        meeting_data = meetings[ident_start:ident_end]
                    committee = self.get_committee(meeting_data)
                    meeting_time = self.get_meeting_time(meeting_data)
                    meeting_date_time = datetime.datetime.strptime(
                        date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                    meeting_date_time = self._tz.localize(meeting_date_time)

                    location = self.get_location(meeting_data)
                    description = self.get_meeting_description(meeting_data)
                    bills = self.get_bills(meeting_data)

                    # Fall back to the committee name when the meeting has
                    # no description of its own.
                    if description == '':
                        description = committee

                    event = Event(session, meeting_date_time,
                                  'committee:meeting', description, location)

                    event.add_source(meeting_url)

                    for bill in bills:

                        if bill['bill_description'] == '':
                            bill['bill_description'] = committee

                        event.add_related_bill(
                            bill_id=bill['bill_id'],
                            description=bill['bill_description'],
                            type='consideration')
                        event.add_document(name=bill['bill_id'],
                                           url=bill['bill_url'],
                                           type='bill',
                                           mimetype='application/pdf')

                    event.add_participant(type='host',
                                          participant=committee,
                                          participant_type='committee',
                                          chamber=chamber)

                    self.save_event(event)
Exemplo n.º 54
0
    def scrape_lower(self, session):
        """Scrape Ohio House committee meetings from the chamber's PDF calendar.

        Downloads the published schedule PDF, converts it to text, then walks
        the day-by-day, committee-by-committee structure, saving one Event per
        meeting with any bills named in its description attached as related
        bills.

        :param session: legislative session identifier stored on each Event.
        """
        PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text')
        os.remove(path)

        # re.split with a capturing group yields
        # [preamble, date1, body1, date2, body2, ...]; pair the slices up.
        chunks = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
        for date, day_body in zip(chunks[1::2], chunks[2::2]):

            # Same alternating layout one level down: committee names
            # alternate with the details that follow them.
            pieces = re.split(r'\n((?:\w+\s?)+)\n', day_body)
            for comm_raw, details in zip(pieces[1::2], pieces[2::2]):
                comm = comm_raw.strip()

                match = re.search(
                        r'''(?mxs)
                        (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                        .*?,\s  # Potential extra text for meeting time
                        (.*?),\s  # Location, usually a room
                        .*?\n  # Chairman of committee holding event
                        (.*)  # Description of event
                        ''',
                        details)
                if match is None:
                    continue
                (time, location, description) = match.groups()

                # Normalize "a.m."/"p.m." into strptime-friendly "AM"/"PM".
                time = time.replace(".", "").upper()
                when = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                        )
                when = self._tz.localize(when)

                location = location.strip()

                # Drop blank lines and leading-digit noise (page/agenda
                # numbers), then strip non-ASCII left by the PDF conversion.
                kept_lines = [ln.strip() for ln in description.split('\n')
                              if ln.strip() and not ln.strip()[0].isdigit()]
                description = '\n'.join(kept_lines).decode('ascii', 'ignore')

                if not description:
                    description = '[No description provided by state]'

                event = Event(
                        session=session,
                        when=when,
                        type='committee:meeting',
                        description=description,
                        location=location
                        )

                event.add_source(PDF_URL)
                event.add_participant(
                        type='host',
                        participant=comm,
                        participant_type='committee',
                        chamber='lower'
                        )
                for line in description.split('\n'):
                    bill_match = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                    if bill_match:
                        (bill_id, relation) = bill_match.groups()
                        event.add_related_bill(
                                bill_id=bill_id.replace(".", ""),
                                type='consideration',
                                description=relation.strip()
                                )

                self.save_event(event)
Exemplo n.º 55
0
    def scrape(self, chamber, session):
        """Scrape Wyoming committee meeting events for one chamber.

        Walks the legislature's committee-calendar table, follows each day's
        meeting link for the requested chamber, splits the meeting page into
        per-meeting row groups, and saves an Event (with related bills and
        documents) for each meeting found.

        :param chamber: 'upper' or 'lower'; 'other' is a no-op.
        :param session: legislative session identifier, used in the URL and
            stored on each Event.
        """
        if chamber == 'other':
            return

        calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                        "CalendarMenu/CommitteeMenu.aspx" % str(session))

        page = self.get_page_from_url(calendar_url)

        rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')

        for i, row in enumerate(rows):

            # ASP.NET control ids on this page are numbered starting at 02.
            row_ident = '%02d' % (i + 2)

            date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                          'Calendars_ctl%s_lblDate"]' % str(row_ident))
            date_string = row.xpath(date_xpath)[0].text_content()

            # First letter of the chamber name ("S"/"H") selects the link.
            chamber_char = self.metadata['chambers'][
                chamber]['name'][0].upper()
            meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                             'Calendars_ctl%s_hl%scallink"]' % (
                                 str(row_ident), chamber_char
                             ))
            meeting_url = row.xpath(meeting_xpath)

            if (len(meeting_url) == 1 and
                    meeting_url[0].text_content().strip() != ''):
                meeting_url = meeting_url[0].attrib['href']
                meeting_page = self.get_page_from_url(meeting_url)
                meetings = meeting_page.xpath(
                    './/table[@class="MsoNormalTable"]/tr')

                # Row index of each meeting's first row (time, committee,
                # location); used to slice the table into per-meeting groups.
                meeting_idents = [
                    row_num for row_num, meeting in enumerate(meetings)
                    if self.is_row_a_new_meeting(meeting)
                ]

                # NOTE: loop index renamed from 'i' to 'j' — the original
                # shadowed the outer row-loop index.
                for j, meeting_ident in enumerate(meeting_idents):
                    if j + 1 == len(meeting_idents):
                        # Last meeting on the page: take all remaining rows.
                        meeting_data = meetings[meeting_ident:]
                    else:
                        ident_start = meeting_ident
                        ident_end = meeting_idents[j + 1] - 1

                        # A two-row meeting still needs its second row
                        # included in the slice.
                        if ident_end - ident_start == 1:
                            ident_end = ident_start + 2

                        meeting_data = meetings[ident_start:ident_end]

                    committee = self.get_committee(meeting_data)
                    meeting_time = self.get_meeting_time(meeting_data)
                    meeting_date_time = datetime.datetime.strptime(
                        date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                    meeting_date_time = self._tz.localize(meeting_date_time)

                    location = self.get_location(meeting_data)
                    description = self.get_meeting_description(meeting_data)
                    bills = self.get_bills(meeting_data)

                    # Fall back to the committee name when the page gives no
                    # description.
                    if description == '':
                        description = committee

                    event = Event(
                        session,
                        meeting_date_time,
                        'committee:meeting',
                        description,
                        location
                    )

                    event.add_source(meeting_url)

                    for bill in bills:

                        if bill['bill_description'] == '':
                            bill['bill_description'] = committee

                        event.add_related_bill(
                            bill_id=bill['bill_id'],
                            description=bill['bill_description'],
                            type='consideration'
                        )
                        event.add_document(
                            name=bill['bill_id'],
                            url=bill['bill_url'],
                            type='bill',
                            mimetype='application/pdf'
                        )

                    event.add_participant(
                        type='host',
                        participant=committee,
                        participant_type='committee',
                        chamber=chamber
                    )

                    self.save_event(event)
Exemplo n.º 56
0
    def scrape(self, chamber, session):
        """Scrape committee meeting events for one chamber.

        Queries the legislature's agenda page for a four-week window starting
        today, splits each committee table into individual meetings at <hr>
        rows, and saves an Event per scheduled meeting with its agenda bills
        attached.

        :param chamber: 'upper', 'lower', or 'other' — mapped to the site's
            senate/house/joint agenda pages.
        :param session: legislative session identifier stored on each Event.
        """
        cha = {
            "upper" : "senate",
            "lower" : "house",
            "other" : "joint"
        }[chamber]

        print_format = "%m/%d/%Y"

        # Query window: today through four weeks out.
        now = dt.datetime.now()
        start = now.strftime(print_format)
        then = now + timedelta(weeks=4)
        end = then.strftime(print_format)
        url = event_page % (
            cha,
            start,
            end
        )

        page = self.lxmlize(url)

        def _split_tr(trs):
            """Split table rows into per-event groups, breaking at <hr> rows."""
            ret = []
            cur = []
            for tr in trs:
                if len(tr.xpath(".//hr")) > 0:
                    ret.append(cur)
                    cur = []
                    continue
                cur.append(tr)
            if cur != []:
                ret.append(cur)
            return ret

        tables = page.xpath("//table[@class='AgendaCommittee']")
        for table in tables:
            trs = table.xpath(".//tr")
            events = _split_tr(trs)
            for event in events:
                # Each event group is a header row (who/when) plus a body row.
                assert len(event) == 2
                header = event[0]
                body = event[1]
                whowhen = header.xpath(".//h2")[0].text_content()
                # Split on the LAST dash: committee names may contain dashes.
                blocks = [ x.strip() for x in whowhen.rsplit("-", 1) ]
                who = blocks[0]
                when = blocks[1].replace(u'\xa0', ' ')
                if "TBA" in when:
                    continue  # XXX: Fixme

                # (Removed dead assignments to `descr` and `flush` that were
                # never read.)
                where = body.xpath(".//br")[1].tail
                if where is not None:
                    where = where.strip()
                else:
                    where = "unknown"

                when = dt.datetime.strptime(when, "%m/%d/%y  %I:%M %p")

                meeting_title = "Scheduled Meeting"  # XXX: Fixme

                agenda = self.scrape_agenda(body.xpath(".//ol"))
                event = Event(session, when, 'committee:meeting',
                              meeting_title, location=where)
                event.add_participant(
                    "host",
                    who,
                    chamber=chamber
                )
                event.add_source(url)

                for item in agenda:
                    event.add_related_bill(
                        item['bill'],
                        description=item['descr'],
                        type="consideration"
                    )
                self.save_event(event)
Exemplo n.º 57
0
    def scrape(self, chamber, session):
        """Scrape Maryland hearing-schedule events for all chambers at once.

        The state publishes a single text-only notice page; this parses each
        fixed-layout <pre> block for time, room, address, and subject, then
        saves an Event per hearing. Runs only when chamber == 'other' so the
        combined page is processed exactly once.

        :param chamber: only 'other' proceeds; other values return None.
        :param session: session slug used in the URL; special sessions
            (ending in 's1') are skipped because those pages 404.
        """
        if chamber != 'other':
            return None  # We're going to do it all on one shot.

        if session[-2:] == "s1":
            return None  # Special sessions 404

        url = "http://mlis.state.md.us/%s/hearsch/alladd.htm" % ( session )
        page = self.lxmlize(url)
        events = page.xpath("//pre")
        for event in events:
            # Committee name precedes the notice, e.g. "JUD - ...".
            ctty_name = [
                x.strip() for x in
                event.getparent().getprevious().text_content().split("-", 1)
            ]
            ctty_name = ctty_name[0]
            event_text = event.text_content()
            if "This meeting has been cancelled." in event_text:
                continue
            # OK. In order to process this text-only notice, we have to resort
            # to some major hackage. Just roll with it.
            lines = event_text.split("\n")
            # Column offset where the address "block" starts; the time and the
            # room share a physical line and are split at this offset.
            address_block = last_space(lines[4])
            assert address_block is not None
            time_room = lines[3]
            time = time_room[:address_block].strip()

            if "TBD" in time:
                continue  # Nothing's set yet.
            time = "%s %s" % (
                lines[1],
                time
            )
            time = re.sub(r"\s+", " ", time).strip()
            # Normalize "P.M."/"A.M." into strptime-friendly "PM"/"AM".
            trans = {
                "P.M." : "PM",
                "A.M." : "AM"
            }
            for transition in trans:
                time = time.replace(transition, trans[transition])

            when = dt.datetime.strptime(time, "%A %B %d, %Y %I:%M %p")

            room = time_room[address_block:].strip()
            place_block = lines[4:]
            where = room + "\n"
            done = False
            offset = 4
            for place in place_block:
                if place.strip() == "":
                    done = True
                if done:
                    continue
                offset += 1
                where += place.strip() + "\n"
            where = where.strip()
            # Remaining lines are "Key: value" metadata; continuation lines
            # (no colon) are appended to the previous key's value.
            moreinfo = lines[offset + 1:]
            info = {}
            key = "unattached_header"
            for inf in moreinfo:
                if ":" in inf:
                    key, value = inf.split(":", 1)
                    key = key.strip()
                    info[key] = value.strip()
                else:
                    # get() guards against a continuation line appearing
                    # before any header (would have raised KeyError).
                    info[key] = info.get(key, "") + " " + inf.strip()
            # Alright. We should have enough now.
            subject = info['Subject']

            event = Event(session, when, 'committee:meeting',
                          subject, location=where)
            event.add_source(url)

            # Infer the hosting chamber from the committee name.
            flags = {
                "joint": "joint",
                "house": "lower",
                "senate": "upper"
            }
            chamber = "other"
            for flag in flags:
                if flag in ctty_name.lower():
                    chamber = flags[flag]

            # Normalize spelled-out bill types ("SENATE BILL 1234") to short
            # forms ("SB1234") so bill ids can be regexed out of the subject.
            trans = {
                "SENATE": "S",
                "HOUSE": "H",
                "JOINT": "J",
                "BILL": "B",
                "RESOLUTION": "R",
            }
            _t_subject = subject.upper()
            for t in trans:
                regex = r"%s(\s+)?" % t
                _t_subject = re.sub(regex, trans[t], _t_subject)
            # (Removed a leftover Python-2 debug `print _t_subject` here.)
            bills = re.findall(r"(S|H)(J)?(B|R|M)\s*(\d{4})", _t_subject)
            for bill in bills:
                name = bill[:3]
                bid = bill[3]
                bill_id = "%s %s" % ( ''.join(name), bid )
                event.add_related_bill(bill_id,
                                       description=subject,
                                       type='consideration')

            event.add_participant("host", ctty_name, chamber=chamber)

            self.save_event(event)
Exemplo n.º 58
0
    def scrape_committee_agendas(self, chamber, session):
        """Scrape upper or lower committee agendas.

        Reads the chamber's agenda-listing table, parses each non-cancelled
        agenda's detail page, and saves an Event per committee meeting.
        """
        # &ShowAll=ON would be simpler but doesn't appear to work.
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
                                          self._chamber_short[chamber]
        with self.urlopen(url) as agendas:
            root = html.fromstring(agendas)
            # Senate and house pages nest the agenda table differently.
            if chamber == 'upper':
                table_xpath = ('//table[@id="body"]/tr/td/table[2]/tr'
                               '/td/table/tr/td/table')
            else:
                table_xpath = ('//table[@id="body"]/tr/td/table[2]/tr'
                               '/td/table/tr/td/table/tr/td/table')
            event_table = root.xpath(table_xpath)[0]
            for row in event_table.xpath('tr')[2:]:
                # House columns: Agenda Date, Committee, Revised, Addendum,
                # Cancelled, Time, Room, HTML Document, PDF Document.
                # Senate columns: Agenda Date, Committee, Revised, Cancelled,
                # Time, Room, HTML Document, PDF Document.
                cells = [td.text_content().strip() for td in row.xpath('td')]
                when, committee = cells[0:2]
                if chamber == 'upper':
                    time, room = cells[4:6]
                    link = row[6].xpath('string(a/@href)')
                else:
                    time, room = cells[5:7]
                    link = row[7].xpath('string(a/@href)')
                if 'NOT MEETING' in time or 'CANCELLED' in time:
                    continue
                time_match = re.match('(\d+:\d+ (A|P))', time)
                if time_match:
                    # Rebuild e.g. "1:30 P" into "1:30 PM" and parse with date.
                    when = "%s %sM" % (cells[0], time_match.group(0))
                    when = datetime.datetime.strptime(when,
                                                      '%m/%d/%Y %I:%M %p')
                else:
                    when = cells[0]
                    when = datetime.datetime.strptime(when, '%m/%d/%Y')

                when = self._tz.localize(when)

                title = "Committee Meeting:\n%s %s %s\n" % (
                                                  self._chamber_long[chamber],
                                                  committee, room)
                agenda_info = self.parse_agenda(chamber, link)

                description = agenda_info['description']
                member_list = agenda_info['member_list']
                meeting_type = agenda_info['meeting_type']    # unused; kept for parity
                agenda_items = agenda_info['agenda_items']    # unused; kept for parity
                related_bills = agenda_info['related_bills']
                other = agenda_info['other']                  # unused; kept for parity

                event = Event(session, when, 'committee:meeting', title,
                              location=room, link=link, details=description)
                event.add_participant('committee', committee, chamber=chamber)

                for bill in related_bills:
                    event.add_related_bill(bill, type="consideration")

                event['participants'].extend(member_list)
                event.add_source(url)
                event.add_source(link)
                self.save_event(event)
Exemplo n.º 59
0
    def scrape_page(self, chamber, session):
        """Scrape a chamber's weekly committee hearing grid.

        The page is a table with one row per committee and one column per day
        of the listed week; each cell holds meeting time(s), an optional room,
        and the bills to be heard. Saves an Event per parsed committee/time
        slot.

        :param chamber: chamber key used to pick the URL from the module-level
            `pages` mapping and recorded on each event's host participant.
        :param session: legislative session identifier stored on each Event.
        """
        url = pages[chamber]
        page = self.lxmlize(url)

        rows = page.xpath("//table[@class='MsoNormalTable']/tr")
        header = rows[0]
        rows = rows[1:]

        # "Week of <Month> <day>, <year>" banner anchors the column dates.
        week_of = page.xpath("//h3[@align='center']/b/text()")[0]
        match = re.match(
            "(?i)Week of (?P<month>.*) (?P<day>\d+), (?P<year>\d{4})",
            week_of)
        day_info = match.groupdict()
        monday_dom = int(day_info['day'])
        days = ["monday", "tuesday", "wednesday", "thursday",
                "friday", "saturday", "sunday"]

        # Map column index -> whitespace-normalized date header text.
        dates = {}
        for col_idx, cell in enumerate(header.xpath(".//td")[1:]):
            dates[col_idx] = re.sub(r"\s+", " ", cell.text_content()).strip()

        def _parse_time_block(block):
            """Parse one grid cell into (hours, room, bills).

            Returns (None, None, []) for empty or unschedulable cells.
            `hours` is a list because a cell may read "10:00 and 1:30".
            """
            if block.strip() == "No Meeting":
                return None, None, []
            room = None

            blocks = [ x.strip() for x in block.split("\n") ]

            # Line 1 is the hour (parenthesized notes dropped); line 2 the
            # bills, with non-ASCII noise stripped.
            hour = re.sub(r"\(.*\)", "", blocks[1])
            bills = blocks[2]
            bills = bills.encode('ascii', errors='ignore')

            # "...after session" style entries have no fixed time.
            if "after" in hour or "after" in bills:
                return None, None, []

            # A room number may be glued onto the hour, e.g. "10:00 Rm 405".
            if "Rm" in hour:
                inf = hour.split("Rm")
                assert len(inf) == 2
                room = inf[1]
                hour = inf[0]

            # Multiple meeting times: "10:00 and 1:30".
            hour = [ x.strip() for x in hour.split('and') ]

            # First pass: a single bill id with its description.
            single_bill = re.search(r"(H|S)(C?)(B|R) \d+", bills)
            if single_bill is not None:
                start, end = single_bill.regs[0]
                description = bills
                bills = [{
                    "bill_id": bills[start:end],
                    "description": description
                }]
            else:
                # Second pass: "HBs 123; 456"-style multi-bill listings.
                # FIX: the separator group was "(;, )" (literal ";, "); the
                # data uses either ";" or ",", i.e. "[;,] ".
                multi_bills = re.search(r"(H|S)(B|R|M)s (\d+([;,] )?)+",
                                        bills)
                if multi_bills is not None:
                    bill_array = bills.split()
                    bill_type = bill_array[0].replace("s", "")
                    bill_array = bill_array[1:]

                    def _clean(f):
                        """Strip separators and collapse whitespace."""
                        for thing in [";", ",", "&", "*"]:
                            f = f.replace(thing, "")
                        return re.sub(r"\s+", " ", f).strip()

                    description = bills
                    bills = [
                        {"bill_id": "%s %s" % (bill_type, _clean(num)),
                         "description": description}
                        for num in bill_array
                    ]
                else:
                    self.warning("Unknown bill thing: %s" % (bills))
                    bills = []

            return hour, room, bills

        for row in rows:
            tds = row.xpath(".//td")
            ctty = re.sub(r"\s+", " ", tds[0].text_content().strip())
            for col_idx, cell in enumerate(tds[1:]):
                hours, room, bills = _parse_time_block(cell.text_content())
                if hours is None or bills == []:
                    continue

                for hour in hours:
                    dt_string = "%s %s" % (dates[col_idx], hour)
                    dt_string = dt_string.encode("ascii", "ignore")

                    # Cell text is "DAY_OF_WEEK <time>"; rebuild a full date
                    # from the "Week of ..." banner plus the weekday offset.
                    dow, time_part = dt_string.split()
                    month = day_info['month']
                    year = day_info['year']
                    day = monday_dom + days.index(dow.lower())

                    dt_string = "%s %s %s, %s %s" % (
                        dow, month, day, year, time_part
                    )

                    # Times appear in several inconsistent shapes.
                    formats = [
                        "%A %B %d, %Y %I:%M %p",
                        "%A %B %d, %Y %I:%M%p",
                        "%A %B %d, %Y %I %p",
                        "%A %B %d, %Y %I%p",
                    ]

                    dtobj = None
                    for fmt in formats:
                        try:
                            dtobj = dt.datetime.strptime(dt_string, fmt)
                        except ValueError:
                            continue
                        break  # FIX: stop at the first format that parses

                    if dtobj is None:
                        self.warning("Unknown guy: %s" % (dt_string))
                        # FIX: raise an informative error, not bare Exception.
                        raise ValueError(
                            "Could not parse event time: %s" % dt_string)

                    event = Event(session,
                                  dtobj,
                                  'committee:meeting',
                                  'Meeting Notice',
                                  "Room %s" % (room))
                    event.add_source(url)

                    for bill in bills:
                        event.add_related_bill(
                            bill['bill_id'],
                            description=bill['description'],
                            type='consideration'
                        )

                    event.add_participant("host",
                                          ctty, 'committee', chamber=chamber)
                    self.save_event(event)