Example #1
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
                try:
                    guid = link.attrib['href']
                except KeyError:
                    continue  # Sometimes we have a dead link. This is only on
                    # dead entries.

                committee = link.xpath("string(../../../td[1])").strip()

                when_and_where = link.xpath("string(../../../td[2])").strip()

                location = when_and_where.split(',')[-1]
                when = parse_datetime(when_and_where, session)

                description = 'Committee Meeting: %s' % committee

                event = Event(session, when, 'committee:meeting',
                              description, location=location)
                event.add_source(url)
                event.add_participant('host', committee, chamber='lower')
                event.add_document("Agenda", guid, type='agenda',
                                   mimetype="application/pdf")
                event['link'] = guid

                self.save_event(event)
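
Examples #1, #3, #10, #12 and #16 call a module-level parse_datetime helper that is not shown (note that Example #1 passes the session where the later variants pass a year). A minimal sketch of what such a helper might look like, assuming it appends the year the page omits and tries a few plausible strptime formats; the name comes from the source, the body is illustrative:

import datetime

def parse_datetime(text, year):
    # Hypothetical: append the missing year, then try known formats.
    text = "%s %s" % (text.strip(), year)
    for fmt in ("%B %d, %I:%M %p %Y", "%b %d, %I:%M %p %Y", "%B %d %Y"):
        try:
            return datetime.datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError("Could not parse schedule text: %r" % text)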
Example #2
    def scrape(self, session, chambers):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            if not when.endswith(session[:len("20XX")]):
                continue

            event = Event(session=session,
                          when=datetime.datetime.strptime(when, '%b %d, %Y'),
                          type='committee:meeting',
                          description=title,
                          location='State Capitol')
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            doc = self.lxmlize(url)
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                if committee_name.lower().startswith("senate"):
                    chamber = "upper"
                elif committee_name.lower().startswith("house"):
                    chamber = "lower"
                else:
                    chamber = "joint"
                event.add_participant(type='host',
                                      participant=committee_name,
                                      participant_type='committee',
                                      chamber=chamber)

            documents = doc.xpath('.//td')
            for document in documents:
                onclick = document.xpath('@onclick')
                if not onclick:
                    continue  # Only cells with onclick handlers carry links.
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(name=document.xpath('text()')[0],
                                   url=url,
                                   mimetype='application/pdf')
                bills = onclick
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        event.add_related_bill(
                            bill_name,
                            type='consideration',
                            description='Bill up for discussion')

            self.save_event(event)
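
Nearly every scraper in this set calls a self.lxmlize(url) helper whose body is not shown. Examples #1 and #3 spell out the same fetch/parse/absolutize sequence inline, so a minimal sketch consistent with its usage would be:

import lxml.html

def lxmlize(self, url):
    # Fetch the page, parse it, and resolve relative hrefs against the
    # request URL (the same steps Examples #1 and #3 perform inline).
    html = self.urlopen(url)
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)
    return page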
Example #3
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                            when_and_where).groupdict()

            when_and_where = info['when']
            location = info['where']

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example #4
    def scrape_agenda(self, url, session):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
        fmt = "%A, %B %d, %Y"
        if time in all_day:
            datetime = date
        else:
            fmt += " %I:%M %p"
            datetime = "%s %s" % (date, time)
        datetime = dt.datetime.strptime(datetime, fmt)

        event = Event(session, datetime, 'committee:meeting',
                      'Meeting Notice', location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(), bill_ft, type="full-text",
                               mimetype="application/pdf")
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = (bill.getparent().getparent().getparent()
                     .getnext().getnext().text_content())

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {"house": "lower", "joint": "joint", "senate": "upper"}
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, chamber=chamber)

        self.save_event(event)
Example #5
    def scrape(self, session, chambers):
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("Expected exactly one description cell")
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          descr,
                          location=where)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [
                    committee,
                ]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids[committee]
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host',
                                      committee['name'],
                                      'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name, notice_href, mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type=bill['type'])

            self.save_event(event)
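
Examples #5 and #7 assume get_short_codes(self) has populated self.short_ids, a mapping from a committee's short code to its name and chamber. The real source of that mapping is not shown; a hypothetical sketch of the shape the loop consumes (keys and values are illustrative only):

def get_short_codes(scraper):
    # Hypothetical data; the real helper scrapes these from a page.
    scraper.short_ids = {
        "WAM": {"chamber": "upper", "name": "Ways and Means"},
        "FIN": {"chamber": "lower", "name": "Finance"},
    }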
Example #6
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(',') if s] + [None] * 3)[:3]
            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                                              '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee_name,
                                  'committee',
                                  chamber='lower')
            event.add_document('Agenda',
                               guid,
                               type='agenda',
                               mimetype='application/pdf')
            event['link'] = guid

            self.save_event(event)
Example #7
    def scrape(self, session, chambers):
        get_short_codes(self)

        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            bills = [x.text_content() for x in tds[1].xpath(".//a")]
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("Expected exactly one description cell")
            descr = descr[0]
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

            event = Event(session, when, 'committee:meeting', descr,
                          location=where)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get(
                        committee, {"chamber": "unknown", "name": committee})
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }

                event.add_participant('host', committee['name'], 'committee',
                                      chamber=committee['chamber'])

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               mimetype='text/html')

            for bill in self.get_related_bills(notice_href):
                event.add_related_bill(
                    bill['bill_id'],
                    description=bill['descr'],
                    type=bill['type']
                )

            self.save_event(event)
Example #8
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                    '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(',') if s] + [None] * 3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(session, when, 'committee:meeting',
                description, location=location)
            event.add_source(url)
            event.add_participant('host', committee_name, 'committee',
                chamber='lower')
            event.add_document('Agenda', guid, type='agenda',
                mimetype='application/pdf')
            event['link'] = guid

            self.save_event(event)
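
A quick demonstration of the pad-and-slice unpacking idiom in Examples #6 and #8: padding the split with [None] * 3 lets the three-way assignment tolerate meeting strings that have fewer than three comma-separated fields.

parts = [s.strip() for s in "Jun 3, 1:00 PM".split(',') if s]
date, time, location = (parts + [None] * 3)[:3]
assert (date, time, location) == ("Jun 3", "1:00 PM", None)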
Example #9
def test_event():
    e = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
              'event description', 'event location')
    e.add_document('agenda', 'http://example.com/event/agenda.txt')
    e.add_related_bill('HB 1', relation='considered')
    assert_equal(e['documents'],
                 [{'name': 'agenda',
                   'url': 'http://example.com/event/agenda.txt',
                   'type': 'other'}])
    assert_equal(e['related_bills'],
                 [{'bill_id': 'HB 1', 'relation': 'considered'}])
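
The test in Examples #9 and #13 pins down two behaviors of Event: it is dict-like, and add_document defaults type to 'other' when none is given. A minimal stand-in that would satisfy this test (illustrative only, not the real class):

class Event(dict):
    # Dict-backed stand-in, just enough to pass test_event above.
    def __init__(self, session, when, type, description, location, **kwargs):
        super(Event, self).__init__(session=session, when=when, type=type,
                                    description=description,
                                    location=location, documents=[],
                                    related_bills=[], **kwargs)

    def add_document(self, name, url, type='other', **kwargs):
        self['documents'].append(dict(name=name, url=url, type=type, **kwargs))

    def add_related_bill(self, bill_id, **kwargs):
        self['related_bills'].append(dict(bill_id=bill_id, **kwargs))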
Example #10
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()
            when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
            if "@" in when_and_where:
                continue  # Contains no time data.

            if when_and_where.strip() == "":
                continue

            info = re.match(
                r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)",
                when_and_where
            ).groupdict()

            when_and_where = info['when']
            location = info['where']

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.
            # when = self._tz.localize(when)

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, 'committee',
                                  chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill, description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example #11
    def scrape(self, session, chambers):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception("Expected exactly one committee link per row")

            comit_url = comit_url[0]
            who = self.scrape_participants(session, comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(session,
                          when,
                          'committee:meeting',
                          name,
                          location=where,
                          link=notice)

            event.add_source(calurl)
            event.add_participant('host',
                                  cttie,
                                  'committee',
                                  chamber=cttie_chamber)
            event.add_document("notice", notice, mimetype='application/pdf')

            for thing in who:
                event.add_participant(thing['title'],
                                      thing['name'],
                                      'legislator',
                                      chamber=cttie_chamber)

            self.save_event(event)
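
The calendar page in Examples #11 and #15 omits the year, so the scraper splices in the current one before parsing. A worked instance of that strptime round-trip (June 5, 2013 really was a Wednesday):

import datetime as dt

date, time = "Wed Jun 5", "1:30 PM"        # as scraped from the page
when = ", ".join([date, str(2013), time])  # "Wed Jun 5, 2013, 1:30 PM"
assert dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p") == \
    dt.datetime(2013, 6, 5, 13, 30)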
Example #12
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../../td[1])").strip()

            when_and_where = link.xpath("string(../../../td[2])").strip()

            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session,
                          when,
                          'committee:meeting',
                          description,
                          location=location)
            event.add_source(url)
            event.add_participant('host',
                                  committee,
                                  'committee',
                                  chamber='lower')
            event.add_document("Agenda",
                               guid,
                               type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill,
                                       description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example #13
def test_event():
    e = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
              'event description', 'event location')
    e.add_document('agenda', 'http://example.com/event/agenda.txt')
    e.add_related_bill('HB 1', relation='considered')
    assert_equal(e['documents'], [{
        'name': 'agenda',
        'url': 'http://example.com/event/agenda.txt',
        'type': 'other'
    }])
    assert_equal(e['related_bills'], [{
        'bill_id': 'HB 1',
        'relation': 'considered'
    }])
Example #14
    def scrape(self, session, chambers):
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath(
                '//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            event = Event(
                    session=session,
                    when=self._TZ.localize(datetime.datetime.strptime(
                            info.xpath('span[@class="col02"]/text()')[0],
                            self._DATETIME_FORMAT
                            )),
                    type='committee:meeting',
                    description=" ".join(x.strip() for x
                            in doc.xpath('//div[@class="schedule"]//text()')
                            if x.strip()),
                    location=doc.xpath(
                            '//div[@class="heading-container"]/span/text()')
                            [0].title()
                    )

            event.add_participant(
                    type='host',
                    participant=info.xpath(
                            'span[@class="col01"]/text()')[0].title(),
                    participant_type='committee'
                    )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                        name=document.xpath('text()')[0],
                        url=document.xpath('@href')[0]
                        )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            self.save_event(event)
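
Example #14's self._TZ.localize(...) (and the self._tz used elsewhere) presumes a pytz timezone attribute on the scraper class. A sketch of that setup, assuming pytz; the zone name is an illustrative guess for an Alaska scraper:

import datetime
import pytz

_TZ = pytz.timezone('US/Alaska')  # assumption: the scraper's local zone
naive = datetime.datetime(2016, 3, 1, 13, 30)
aware = _TZ.localize(naive)  # attaches the zone; wall-clock time unchanged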
Example #15
    def scrape(self, session, chambers):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception("Expected exactly one committee link per row")

            comit_url = comit_url[0]
            who = self.scrape_participants(session, comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(session, when, 'committee:meeting', name,
                          location=where, link=notice)

            event.add_source(calurl)
            event.add_participant('host', cttie, 'committee',
                                  chamber=cttie_chamber)
            event.add_document("notice", notice, mimetype='application/pdf')

            for thing in who:
                event.add_participant(thing['title'], thing['name'],
                                      'legislator', chamber=cttie_chamber)

            self.save_event(event)
Example #16
    def scrape_house_weekly_schedule(self, session):
        url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee = link.xpath("string(../../td[1])").strip()

            when_and_where = link.xpath("string(../../td[2])").strip()

            location = when_and_where.split(',')[-1]

            if when_and_where.strip() == "":
                continue

            year = datetime.datetime.now().year
            when = parse_datetime(when_and_where, year)  # We can only scrape
            # current year's events in LA.

            bills = self.scrape_bills(when_and_where)

            description = 'Committee Meeting: %s' % committee

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, 'committee',
                                  chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            for bill in bills:
                event.add_related_bill(bill, description=when_and_where,
                                       type='consideration')
            event['link'] = guid

            self.save_event(event)
Example #17
    def scrape(self, session, chambers):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            (title, when) = info.xpath('title/text()')[0].split(" - ")
            if not when.endswith(session[:len("20XX")]):
                continue

            event = Event(
                    session=session,
                    when=datetime.datetime.strptime(when, '%b %d, %Y'),
                    type='committee:meeting',
                    description=title,
                    location='State Capitol'
                    )
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            doc = self.lxmlize(url)
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                        '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(
                        type='host',
                        participant=committee_name,
                        participant_type='committee'
                        )

            documents = doc.xpath('.//td')
            for document in documents:
                onclick = document.xpath('@onclick')
                if not onclick:
                    continue  # Only cells with onclick handlers carry links.
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                event.add_document(
                        name=document.xpath('text()')[0],
                        url=url.group(1),
                        mimetype='application/pdf'
                        )

            self.save_event(event)
Example #18
    def scrape(self, session, chambers):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            if not when.endswith(session[:len("20XX")]):
                continue

            event = Event(
                    session=session,
                    when=datetime.datetime.strptime(when, '%b %d, %Y'),
                    type='committee:meeting',
                    description=title,
                    location='State Capitol'
                    )
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
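            # Assumption: HTTPError is scrapelib's (the HTTP layer these
            # scrapers sit on); its import is not shown in this snippet.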
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                        '//h3[@class="heading committee"]/text()')[0].strip()
                if committee_name.lower().startswith("senate"):
                    chamber = "upper"
                elif committee_name.lower().startswith("house"):
                    chamber = "lower"
                else:
                    chamber = "joint"
                event.add_participant(
                        type='host',
                        participant=committee_name,
                        participant_type='committee',
                        chamber=chamber
                        )

            documents = doc.xpath('.//td')
            for document in documents:
                onclick = document.xpath('@onclick')
                if not onclick:
                    continue  # Only cells with onclick handlers carry links.
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(
                        name=document.xpath('text()')[0],
                        url=url,
                        mimetype='application/pdf'
                        )
                bills = onclick
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        event.add_related_bill(bill_name,
                            type='consideration',
                            description='Bill up for discussion')

            self.save_event(event)
Example #19
    def scrape_agenda(self, url, session):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        if time in all_day:
            datetime = date
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime or "Rise of the House" in datetime:
            # XXX: Do something more advanced.
            return

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(session, datetime, 'committee:meeting',
                      'Meeting Notice', location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(), bill_ft, type="full-text",
                               mimetype="application/pdf")
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = (bill.getparent().getparent().getparent()
                     .getnext().getnext().text_content())

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {"house": "lower", "joint": "joint", "senate": "upper"}
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, 'committee', chamber=chamber)

        self.save_event(event)
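
A quick demonstration of the normalization table in Examples #19 and #24: noise tokens are rewritten or removed so strptime can match one of the listed formats. The "PM." entry exists because "P.M" may fire before "P.M.", leaving "PM." behind.

transtable = {"P.M": "PM", "PM.": "PM", "P.M.": "PM", "POSTPONED": ""}
s = "Tuesday, May 1, 2012 2:00 P.M. POSTPONED"
for trans in transtable:
    s = s.replace(trans, transtable[trans])
assert s.strip() == "Tuesday, May 1, 2012 2:00 PM"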
Example #20
    def scrape(self, chamber, session):
        if chamber == 'other':
            return

        calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                        "CalendarMenu/CommitteeMenu.aspx" % str(session))

        page = self.get_page_from_url(calendar_url)

        rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')

        for i, row in enumerate(rows):

            row_ident = '%02d' % (i + 2)

            date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                          'Calendars_ctl%s_lblDate"]' % str(row_ident))
            date_string = row.xpath(date_xpath)[0].text_content()

            chamber_char = self.metadata['chambers'][
                chamber]['name'][0].upper()
            meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                             'Calendars_ctl%s_hl%scallink"]' % (
                                 str(row_ident), chamber_char
                             ))
            meeting_url = row.xpath(meeting_xpath)

            if (len(meeting_url) == 1 and
                    meeting_url[0].text_content().strip() != ''):
                meeting_url = meeting_url[0].attrib['href']
                meeting_page = self.get_page_from_url(meeting_url)
                meetings = meeting_page.xpath(
                    './/table[@class="MsoNormalTable"]/tr')
                meeting_idents = []
                meeting_ident = 0

                # breaking the meetings into arrays (meeting_data) for
                # processing. meeting_ident is the first row of the meeting
                # (time, committee, location)
                for meeting in meetings:
                    if self.is_row_a_new_meeting(meeting):
                        meeting_idents.append(meeting_ident)
                    meeting_ident += 1

                for i, meeting_ident in enumerate(meeting_idents):

                    if len(meeting_idents) == 1 or i + 1 == len(meeting_idents):
                        ident_start, ident_end = [meeting_ident, 0]
                        meeting_data = meetings[ident_start:]
                    else:
                        ident_start, ident_end = [
                            meeting_ident, meeting_idents[i + 1] - 1
                        ]

                        if ident_end - ident_start == 1:
                            ident_end = ident_start + 2

                        meeting_data = meetings[ident_start:ident_end]
                    committee = self.get_committee(meeting_data)
                    meeting_time = self.get_meeting_time(meeting_data)
                    meeting_date_time = datetime.datetime.strptime(
                        date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                    meeting_date_time = self._tz.localize(meeting_date_time)

                    location = self.get_location(meeting_data)
                    description = self.get_meeting_description(meeting_data)
                    bills = self.get_bills(meeting_data)

                    if description == '':
                        description = committee

                    event = Event(
                        session,
                        meeting_date_time,
                        'committee:meeting',
                        description,
                        location
                    )

                    event.add_source(meeting_url)

                    for bill in bills:

                        if bill['bill_description'] == '':
                            bill['bill_description'] = committee

                        event.add_related_bill(
                            bill_id=bill['bill_id'],
                            description=bill['bill_description'],
                            type='consideration'
                        )
                        event.add_document(
                            name=bill['bill_id'],
                            url=bill['bill_url'],
                            type='bill',
                            mimetype='application/pdf'
                        )

                    event.add_participant(
                        type='host',
                        participant=committee,
                        participant_type='committee',
                        chamber=chamber
                    )

                    self.save_event(event)
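
Examples #20 through #22 group rows by calling self.is_row_a_new_meeting(meeting), which is not shown. The comment above says a meeting's first row carries its time, committee, and location, so a hypothetical detector might look for a leading wall-clock time (the method name is from the source; the heuristic is a guess):

import re

def is_row_a_new_meeting(self, row):
    # Guess: a meeting header row is one whose text starts with a time
    # such as "8:30 AM"; continuation rows hold bills and descriptions.
    text = row.text_content().strip()
    return bool(re.match(r'\d{1,2}:\d{2}\s*(AM|PM)', text, re.IGNORECASE))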
Example #21
    def scrape(self, chamber, session):
        if chamber == 'other':
            return

        calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                        "CalendarMenu/CommitteeMenu.aspx" % str(session))

        page = self.lxmlize(calendar_url)

        rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')

        for i, row in enumerate(rows):

            row_ident = '%02d' % (i + 2)

            date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                          'Calendars_ctl%s_lblDate"]' % str(row_ident))
            date_string = row.xpath(date_xpath)[0].text_content()

            chamber_char = self.metadata['chambers'][
                chamber]['name'][0].upper()
            meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                             'Calendars_ctl%s_hl%scallink"]' %
                             (str(row_ident), chamber_char))
            meeting_url = row.xpath(meeting_xpath)

            if (len(meeting_url) == 1
                    and meeting_url[0].text_content().strip() != ''):
                try:
                    meeting_url = meeting_url[0].attrib['href']
                except KeyError:
                    self.warning("Alleged meeting date has no URL: " +
                                 meeting_url[0].text_content().strip())
                    continue

                meeting_page = self.lxmlize(meeting_url)
                meetings = meeting_page.xpath(
                    './/table[@class="MsoNormalTable"]/tr')
                meeting_idents = []
                meeting_ident = 0

                # breaking the meetings into arrays (meeting_data) for
                # processing. meeting_ident is the first row of the meeting
                # (time, committee, location)
                for meeting in meetings:
                    if self.is_row_a_new_meeting(meeting):
                        meeting_idents.append(meeting_ident)
                    meeting_ident += 1

                for i, meeting_ident in enumerate(meeting_idents):

                    if len(meeting_idents) == 1 or i + 1 == len(
                            meeting_idents):
                        ident_start, ident_end = [meeting_ident, 0]
                        meeting_data = meetings[ident_start:]
                    else:
                        ident_start, ident_end = [
                            meeting_ident, meeting_idents[i + 1] - 1
                        ]

                        if ident_end - ident_start == 1:
                            ident_end = ident_start + 2

                        meeting_data = meetings[ident_start:ident_end]
                    committee = self.get_committee(meeting_data)
                    meeting_time = self.get_meeting_time(meeting_data)
                    meeting_date_time = datetime.datetime.strptime(
                        date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                    meeting_date_time = self._tz.localize(meeting_date_time)

                    location = self.get_location(meeting_data)
                    description = self.get_meeting_description(meeting_data)
                    bills = self.get_bills(meeting_data)

                    if description == '':
                        description = committee

                    event = Event(session, meeting_date_time,
                                  'committee:meeting', description, location)

                    event.add_source(meeting_url)

                    for bill in bills:

                        if bill['bill_description'] == '':
                            bill['bill_description'] = committee

                        event.add_related_bill(
                            bill_id=bill['bill_id'],
                            description=bill['bill_description'],
                            type='consideration')
                        event.add_document(name=bill['bill_id'],
                                           url=bill['bill_url'],
                                           type='bill',
                                           mimetype='application/pdf')

                    event.add_participant(type='host',
                                          participant=committee,
                                          participant_type='committee',
                                          chamber=chamber)

                    self.save_event(event)
Example #22
    def scrape(self, chamber, session):
        if chamber == "other":
            return

        calendar_url = "http://legisweb.state.wy.us/%s/Calendar/CalendarMenu/CommitteeMenu.aspx" % str(session)

        page = self.lxmlize(calendar_url)

        rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')

        for i, row in enumerate(rows):

            row_ident = "%02d" % (i + 2)

            date_xpath = './/span[@id="ctl00_cphContent_gvCalendars_ctl%s_lblDate"]' % str(row_ident)
            date_string = row.xpath(date_xpath)[0].text_content()

            chamber_char = self.metadata["chambers"][chamber]["name"][0].upper()
            meeting_xpath = './/a[@id="ctl00_cphContent_gvCalendars_ctl%s_hl%scallink"]' % (
                str(row_ident),
                chamber_char,
            )
            meeting_url = row.xpath(meeting_xpath)

            if len(meeting_url) == 1 and meeting_url[0].text_content().strip() != "":
                try:
                    meeting_url = meeting_url[0].attrib["href"]
                except KeyError:
                    self.warning("Alleged meeting date has no URL: " + meeting_url[0].text_content().strip())
                    continue

                meeting_page = self.lxmlize(meeting_url)
                meetings = meeting_page.xpath('.//table[@class="MsoNormalTable"]/tr')
                meeting_idents = []
                meeting_ident = 0

                # breaking the meetings into arrays (meeting_data) for
                # processing. meeting_ident is the first row of the meeting
                # (time, committee, location)
                for meeting in meetings:
                    if self.is_row_a_new_meeting(meeting):
                        meeting_idents.append(meeting_ident)
                    meeting_ident += 1

                for i, meeting_ident in enumerate(meeting_idents):

                    if len(meeting_idents) == 1 or i + 1 == len(meeting_idents):
                        ident_start, ident_end = [meeting_ident, 0]
                        meeting_data = meetings[ident_start:]
                    else:
                        ident_start, ident_end = [meeting_ident, meeting_idents[i + 1] - 1]

                        if ident_end - ident_start == 1:
                            ident_end = ident_start + 2

                        meeting_data = meetings[ident_start:ident_end]
                    committee = self.get_committee(meeting_data)
                    meeting_time = self.get_meeting_time(meeting_data)
                    meeting_date_time = datetime.datetime.strptime(
                        date_string + " " + meeting_time, "%m/%d/%Y %I:%M %p"
                    )
                    meeting_date_time = self._tz.localize(meeting_date_time)

                    location = self.get_location(meeting_data)
                    description = self.get_meeting_description(meeting_data)
                    bills = self.get_bills(meeting_data)

                    if description == "":
                        description = committee

                    event = Event(session, meeting_date_time, "committee:meeting", description, location)

                    event.add_source(meeting_url)

                    for bill in bills:

                        if bill["bill_description"] == "":
                            bill["bill_description"] = committee

                        event.add_related_bill(
                            bill_id=bill["bill_id"], description=bill["bill_description"], type="consideration"
                        )
                        event.add_document(
                            name=bill["bill_id"], url=bill["bill_url"], type="bill", mimetype="application/pdf"
                        )

                    event.add_participant(
                        type="host", participant=committee, participant_type="committee", chamber=chamber
                    )

                    self.save_event(event)
Example #23
    def scrape(self, session, chambers):
        hansard_urls = {'39th1st': 'http://www.leg.bc.ca/hansard/39th1st/index.htm',
                        '39th2nd': 'http://www.leg.bc.ca/hansard/39th2nd/index.htm',
                        '39th3rd': 'http://www.leg.bc.ca/hansard/39th3rd/index.htm',
                        '39th4th': 'http://www.leg.bc.ca/hansard/8-8.htm'}
        url = hansard_urls[session]

        page = self.lxmlize(url)
        for row in page.xpath("//table/tr"):
            ids = row.xpath(".//td[@align='left']/p")
            web_links = row.xpath(".//a[contains(text(), 'HTML')]")
            pdf_links = row.xpath(".//a[contains(text(), 'PDF')]")

            if web_links == [] and pdf_links == []:
                continue
            if ids == []:
                continue

            if len(web_links) != 1:
                continue  # XXX: Bug, deal with me! We sometimes get a ton
                # of unwanted hansard. Some of the xpath must be wrong.

            ids = ids[-1]
            date = ids.text.strip()
            hansard_id = ids.xpath(".//br")[0].tail
            hansard_id = re.sub(r"\s+", " ", hansard_id).strip()
            if date == "":
                continue

            times_of_day = ["Morning", "Afternoon"]
            time_of_day = None
            for time in times_of_day:
                if date.endswith(time):
                    # Slice off the ", Morning"/", Afternoon" suffix;
                    # str.rstrip would treat it as a character set.
                    date = date[:-len(time)].rstrip(", ")
                    time_of_day = time
            when = dt.datetime.strptime(date, "%A, %B %d, %Y")
            description = ("%s session on %s" % (time_of_day, date)
                           if time_of_day else "Session on %s" % (date))
            event = Event(
                session,
                when,
                'cow:meeting',
                description,
                location='Parliament Buildings',
                record_id=hansard_id  # Official record's ID for speeches.
            )
            for x in web_links:
                event.add_document(x.text_content(),
                                   x.attrib['href'],
                                   type="transcript",
                                   mimetype="text/html")
            for x in pdf_links:
                event.add_document(x.text_content(),
                                   x.attrib['href'],
                                   type="transcript",
                                   mimetype="application/pdf")
            event.add_source(url)
            self.save_object(event)

            for a in web_links:
                self.scrape_hansard(session, 'lower',
                                    a.attrib['href'], hansard_id)
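
Example #23 strips the ", Morning"/", Afternoon" suffix by slicing rather than str.rstrip: rstrip treats its argument as a set of characters to remove, not as a suffix, so it can silently eat legitimate trailing text.

assert "2011, Morning".rstrip(", Morning") == "2011"  # works only by luck
assert "noon, Morning".rstrip(", Morning") == ""      # every char in the set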
Example #24
    def scrape_agenda(self, url, session):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']
        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        if time in all_day:
            datetime = date
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime:
            # XXX: Do something more advanced.
            return

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(session,
                      datetime,
                      'committee:meeting',
                      'Meeting Notice',
                      location=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(bill.text_content(),
                               bill_ft,
                               type="full-text",
                               mimetype="application/pdf")
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = (bill.getparent().getparent().getparent()
                     .getnext().getnext().text_content())

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            event.add_related_bill(bill_id,
                                   description=descr,
                                   type='consideration')
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        chambers = {"house": "lower", "joint": "joint", "senate": "upper"}
        chamber = "other"
        for key in chambers:
            if key in committee.lower():
                chamber = chambers[key]

        event.add_participant("host", committee, 'committee', chamber=chamber)

        self.save_event(event)