Example #1
    def scrape_meeting_notice(self, chamber, item, url):
        # The event name is not provided for all meetings, so fall back
        # to the committee name.
        event_name = str(item['CommitteeName'])
        # Sample input: 04/25/2012 03:00:00 PM (four-digit year, with seconds)
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
        location_name = str(item['AddressAliasNickname'])
        event = Event(location_name=location_name,
                      start_date=self._tz.localize(start_time),
                      name=event_name,
                      description='Committee Meeting Status: {}'
                      .format(item['CommitteeMeetingStatusName'])
                      )

        event.add_source(url)
        event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item['CommitteeMeetingId'])
                    )

        event.add_source(page_url)
        page_data = self.post(page_url).json()['Data']
        # Use a fresh name so the loop does not shadow the `item` argument.
        for row in page_data:
            event.add_agenda_item(description=str(row['ItemDescription']))
            event.add_person(name=str(row['PrimarySponsorShortName']),
                             id=str(row['PrimarySponsorPersonId']),
                             note='Sponsor')

        yield event
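
As a quick check that the corrected format string matches the sample timestamp in the comment, the parse can be exercised with nothing but the standard library:

    import datetime as dt

    fmt = "%m/%d/%Y %I:%M:%S %p"
    parsed = dt.datetime.strptime("04/25/2012 03:00:00 PM", fmt)
    assert parsed == dt.datetime(2012, 4, 25, 15, 0, 0)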
Example #2
    def scrape_meeting_notice(self, chamber, item, url):
        # The event name is not provided for all meetings, so fall back
        # to the committee name.
        event_name = str(item['CommitteeName'])
        # Sample input: 04/25/2012 03:00:00 PM (four-digit year, with seconds)
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
        location_name = str(item['AddressAliasNickname'])
        event = Event(location_name=location_name,
                      start_date=self._tz.localize(start_time),
                      name=event_name,
                      description='Committee Meeting Status: {}'.format(
                          item['CommitteeMeetingStatusName']))

        event.add_source(url)
        event.add_committee(name=str(item['CommitteeName']),
                            id=item['CommitteeId'])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item['CommitteeMeetingId']))

        event.add_source(page_url)
        page_data = self.post(page_url).json()['Data']
        # Use a fresh name so the loop does not shadow the `item` argument.
        for row in page_data:
            event.add_agenda_item(description=str(row['ItemDescription']))
            event.add_person(name=str(row['PrimarySponsorShortName']),
                             id=str(row['PrimarySponsorPersonId']),
                             note='Sponsor')

        yield event
Example #3
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("expected exactly one description span")
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr,
                          start_time=when,
                          classification='committee-meeting',
                          description=descr,
                          location_name=where,
                          timezone=tz.zone)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get(committee, {
                        "chamber": "unknown",
                        "name": committee
                    })

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(bill['bill_id'], note=bill['type'])
            yield event
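
One pytz subtlety in this example: pytz.utc.localize(when) attaches UTC to the naive datetime without converting it, even though the event is tagged with the Eastern zone via timezone=tz.zone. If the listed times are Eastern wall-clock times (an assumption about the source site), the usual pattern is to localize with the zone the wall clock belongs to and convert afterwards:

    import datetime as dt

    import pytz

    eastern = pytz.timezone("US/Eastern")
    naive = dt.datetime(2012, 4, 25, 15, 0)   # 3:00 PM as scraped, no tzinfo
    aware = eastern.localize(naive)           # interpret as Eastern wall-clock time
    as_utc = aware.astimezone(pytz.utc)       # convert only after localizing
    assert as_utc.hour == 19                  # EDT is UTC-4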
Example #4
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception("expected exactly one committee link per row")

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
Example #5
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("expected exactly one description span")
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr, start_time=when, classification='committee-meeting',
                          description=descr, location_name=where, timezone=tz.zone)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get(
                        committee, {"chamber": "unknown", "name": committee})

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(
                    bill['bill_id'],
                    note=bill['type']
                )
            yield event
Example #6
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        records = self.access_to_csv("Agendas")
        for record in records:
            if record['Status'] != "Scheduled":
                continue
            description = record['Comments']
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id": "%s %s" % (bill[0], bill[2]),
                    "descr": description
                })

            date_time = "%s %s" % (record['Date'], record['Time'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

            try:
                hr_name = self._committees[record['CommHouse']]
            except KeyError:
                self.warning('unknown committee code %s, skipping',
                             record['CommHouse'])
                continue

            description = 'Meeting of the {}'.format(hr_name)

            event = Event(
                name=description,
                start_date=self._tz.localize(date_time),
                location_name=record['Location'] or 'Statehouse',
            )
            item = None
            for bill in related_bills:
                item = item or event.add_agenda_item(description)
                item.add_bill(bill['bill_id'])

            event.add_committee(
                hr_name,
                id=record['CommHouse'],
                note='host',
            )
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')

            yield event
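
The related-bill regex captures the chamber letter, an optional hyphen, and a four-digit bill number; a minimal demonstration of what re.findall returns for it:

    import re

    pattern = re.compile(r"(A|S)(-)?(\d{4})")
    comments = "Discussion of A-1234 and S5678."
    bills = ["%s %s" % (m[0], m[2]) for m in pattern.findall(comments)]
    assert bills == ["A 1234", "S 5678"]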
Example #7
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
        year_slug = self.jurisdiction.get_year_slug(session)

        url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
            year_slug
        )

        json_data = self.get(url).text
        events = json.loads(json_data)["data"]

        for info in events:
            # Determine when the committee meets
            if (
                info["TimeSlot"] == ""
                or info["TimeSlot"] == "1"
                or info["TimeSlot"] == 1
            ):
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"], "%A, %B %d, %Y"
                )
                all_day = True
            else:
                try:
                    start_time = datetime.datetime.strptime(
                        info["MeetingDate"] + ", " + info["TimeSlot"],
                        "%A, %B %d, %Y, %I:%M %p",
                    )
                except ValueError:
                    start_time = datetime.datetime.strptime(
                        info["MeetingDate"] + ", " + info["StartTime"],
                        "%A, %B %d, %Y, %I:%M %p",
                    )
                all_day = False

            event = Event(
                start_date=self.TIMEZONE.localize(start_time),
                all_day=all_day,
                name="Meeting of the {}".format(info["LongName"]),
                description="committee meeting",
                location_name="{0}, Room {1}".format(
                    info["BuildingName"], info["RoomNbr"]
                ),
            )
            event.add_source(url)
            event.add_committee(name=info["LongName"], note="host")

            yield event
Example #8
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        records = self.access_to_csv("Agendas")
        for record in records:
            if record['Status'] != "Scheduled":
                continue
            description = record['Comments']
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id": "%s %s" % (bill[0], bill[2]),
                    "descr": description
                })

            date_time = "%s %s" % (record['Date'], record['Time'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

            try:
                hr_name = self._committees[record['CommHouse']]
            except KeyError:
                self.warning('unknown committee code %s, skipping', record['CommHouse'])
                continue

            description = 'Meeting of the {}'.format(hr_name)

            event = Event(
                name=description,
                start_date=self._tz.localize(date_time),
                location_name=record['Location'] or 'Statehouse',
            )
            item = None
            for bill in related_bills:
                item = item or event.add_agenda_item(description)
                item.add_bill(bill['bill_id'])

            event.add_committee(
                hr_name,
                id=record['CommHouse'],
                note='host',
            )
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')

            yield event
Example #9
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
        year_slug = session[5:]

        url = 'http://legislature.vermont.gov/committee/loadAllMeetings/{}'.format(
            year_slug)

        json_data = self.get(url).text
        events = json.loads(json_data)['data']

        for info in events:
            # Determine when the committee meets
            if info['TimeSlot'] == '1':
                start_time = datetime.datetime.strptime(info['MeetingDate'], '%A, %B %d, %Y')
                all_day = True
            else:
                try:
                    start_time = datetime.datetime.strptime(
                        info['MeetingDate'] + ', ' + info['TimeSlot'],
                        '%A, %B %d, %Y, %I:%M %p'
                    )
                except ValueError:
                    start_time = datetime.datetime.strptime(
                        info['MeetingDate'] + ', ' + info['StartTime'],
                        '%A, %B %d, %Y, %I:%M %p'
                    )
                all_day = False

            event = Event(
                start_time=self.TIMEZONE.localize(start_time),
                timezone='America/New_York',
                all_day=all_day,
                name="Meeting of the {}".format(info['LongName']),
                description="committee meeting",
                location_name="{0}, Room {1}".format(info['BuildingName'], info['RoomNbr'])
            )
            event.add_source(url)
            event.add_committee(
                name=info['LongName'],
                note='host'
            )

            yield event
Example #10
    def scrape(self):
        url = 'https://lims.minneapolismn.gov/Calendar/GetCalenderList?'
        council_events = cal_list
        for c in council_events:
            mtg_time = datetime.strptime(c['MeetingTime'], CAL_DATE_FORMAT)
            dt = tz.localize(mtg_time)
            e = Event(name=c['CommitteeName'],
                      start_date=dt,
                      location_name=c['Location'])
            e.add_committee(c['CommitteeName'])
            e.add_source(url)
            if c['MarkedAgendaPublished']:
                event_url = "{0}{1}/{2}".format(AGENDA_BASE_URL,
                                                c['Abbreviation'],
                                                c['AgendaId'])
                e.add_media_link(note="Agenda",
                                 url=event_url,
                                 media_type="link")
            yield e
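
This method leans on module-level names that are not shown (cal_list, CAL_DATE_FORMAT, AGENDA_BASE_URL, tz). A plausible sketch of those definitions follows; the format string and agenda base URL are assumptions for illustration, not values confirmed by the source:

    import pytz
    import requests

    tz = pytz.timezone("US/Central")         # Minneapolis local time
    CAL_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"    # assumed shape of c['MeetingTime']
    AGENDA_BASE_URL = "https://lims.minneapolismn.gov/Agenda/"  # hypothetical
    cal_list = requests.get(
        "https://lims.minneapolismn.gov/Calendar/GetCalenderList?").json()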
Example #11
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
        year_slug = self.jurisdiction.get_year_slug(session)

        url = 'http://legislature.vermont.gov/committee/loadAllMeetings/{}'.format(year_slug)

        json_data = self.get(url).text
        events = json.loads(json_data)['data']

        for info in events:
            # Determine when the committee meets
            if info['TimeSlot'] == '' or info['TimeSlot'] == '1':
                start_time = datetime.datetime.strptime(info['MeetingDate'], '%A, %B %d, %Y')
                all_day = True
            else:
                try:
                    start_time = datetime.datetime.strptime(
                        info['MeetingDate'] + ', ' + info['TimeSlot'],
                        '%A, %B %d, %Y, %I:%M %p'
                    )
                except ValueError:
                    start_time = datetime.datetime.strptime(
                        info['MeetingDate'] + ', ' + info['StartTime'],
                        '%A, %B %d, %Y, %I:%M %p'
                    )
                all_day = False

            event = Event(
                start_date=self.TIMEZONE.localize(start_time),
                all_day=all_day,
                name="Meeting of the {}".format(info['LongName']),
                description="committee meeting",
                location_name="{0}, Room {1}".format(info['BuildingName'], info['RoomNbr'])
            )
            event.add_source(url)
            event.add_committee(
                name=info['LongName'],
                note='host'
            )

            yield event
Example #12
    def scrape(self):

        current_date = datetime.today()
        current_month = current_date.month
        current_year = current_date.year

        # Build "YYYY-MM" slugs for the next four months, rolling the year
        # over at December.
        date_range = []
        for x in range(0, 4):
            if current_month == 12:
                date_range.append("{0}-12".format(current_year))
                current_month = 1
                current_year += 1
            else:
                date_range.append("{0}-{1:02d}".format(current_year,
                                                       current_month))
                current_month += 1

        format1 = "%A %B %d, %Y - %I:%M %p"
        format2 = "%A %B %d, %Y - "
        format3 = "%m/%d/%y"
        for date in date_range:
            root = requests.get("https://www.stpaul.gov/calendar/" + date)
            base = html.fromstring(root.text)
            items = base.xpath('.//*/div[@class="view-content"]/div')
            meetings = []
            for i in items:
                if len(
                        i.xpath(
                            './/*/span[@class="date-display-single"]/text()')
                ) > 0:
                    d = {}
                    d['date'] = i.xpath(
                        './/*/span[@class="date-display-single"]/text()')[0]
                    d['info'] = i.xpath(
                        './/*/span[@class="field-content"]/a/text()')[0]
                    d['link'] = i.xpath(
                        './/*/span[@class="field-content"]/a/@href')[0]
                    meetings.append(d)
            for m in meetings:
                m['link'] = "https://www.stpaul.gov" + m['link']
            for m in meetings:
                ppr(m['info'])
                r = requests.get(m['link'])
                b = html.fromstring(r.text)
                exists = b.xpath('.//div[@class="node-content clearfix"]')
                if len(exists) > 0:
                    date = exists[0].xpath(
                        './/*/span[@class="date-display-single"]/text()')
                    loc1 = exists[0].xpath(
                        './/*/div[@class="thoroughfare"]/text()')
                    loc2 = exists[0].xpath('.//*/div[@class="premise"]/text()')
                    if len(loc1) > 0:
                        m['location'] = loc1[0]
                    else:
                        m['location'] = 'N/A'
                    if len(loc2) > 0:
                        m['location'] = m['location'] + " " + loc2[0]
                    if ":" in date[0]:
                        m['date'] = datetime.strptime(date[0], format1)
                    elif "/" in date[0]:
                        # Zero-pad single-digit month/day so strptime accepts it.
                        new_date = '/'.join(n.zfill(2) for n in date[0].split('/'))
                        m['date'] = datetime.strptime(new_date, format3)
                    else:
                        date = datetime.strptime(date[0], format2)
                        m['date'] = date
                    m['date'] = tz.localize(m['date'])
                    if not any(k in m['info'] for k in
                               ('City Council', 'Legislative', 'Holiday')):

                        event = Event(name=m['info'].strip(),
                                      start_date=m['date'],
                                      location_name=m['location'])
                        m['name'] = m['info'].replace('Meeting', '').replace(
                            ' - Cancelled', '').replace('Events', '').strip()
                        event.add_committee(m['name'])
                    elif 'Holiday' in m['info']:
                        event = Event(name=m['info'].strip(),
                                      start_date=m['date'],
                                      location_name=m['location'])
                    else:
                        event = Event(name=m['info'].strip(),
                                      start_date=m['date'],
                                      location_name=m['location'])
                        event.add_committee('Saint Paul City Council')
                    event.add_source(m['link'])
                    yield event
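
With the append and increment outside the zero-padding branch, the month builder yields four consecutive YYYY-MM slugs even across a year boundary; the same logic in isolation:

    current_month, current_year = 11, 2024
    date_range = []
    for _ in range(4):
        date_range.append("{0}-{1:02d}".format(current_year, current_month))
        if current_month == 12:
            current_month, current_year = 1, current_year + 1
        else:
            current_month += 1
    assert date_range == ["2024-11", "2024-12", "2025-01", "2025-02"]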
Example #13
    def scrape(self):

        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()

            if self.short_ids.get(committee):
                descr = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            else:
                descr = [x.text_content() for x in tds[1].xpath(".//span")]
                if len(descr) != 1:
                    raise Exception("expected exactly one description span")
                descr = descr[0].replace(".", "").strip()

            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib["href"]
            notice_name = notice.text

            # the listing page shows the same hearing in multiple rows.
            # combine these -- get_related_bills() will take care of adding the bills
            # and descriptions
            if notice_href in self.seen_hearings:
                continue
            else:
                self.seen_hearings.append(notice_href)

            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = TIMEZONE.localize(when)
            event = Event(
                name=descr,
                start_date=when,
                classification="committee-meeting",
                description=descr,
                location_name=where,
            )

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee and committee in self.short_ids:
                    committee = "{} {}".format(
                        self.chambers[self.short_ids[committee]["chamber"]],
                        self.short_ids[committee]["name"],
                    )
                event.add_committee(committee, note="host")

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type="text/html")
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill["descr"].strip())
                a.add_bill(bill["bill_id"], note=bill["type"])
            yield event
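
self.seen_hearings (initialized elsewhere in the class) is only ever used for membership tests and appends, so a set would give the same deduplication with O(1) lookups; a sketch of the idea in isolation:

    seen_hearings = set()

    def is_new(notice_href):
        """Return True the first time a hearing URL is seen."""
        if notice_href in seen_hearings:
            return False
        seen_hearings.add(notice_href)
        return True

    assert is_new("https://example.com/hearing/1")
    assert not is_new("https://example.com/hearing/1")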
Example #14
    def scrape_chamber(self, chamber=None):
        # If chamber is None, don't exclude any events from the results based on chamber
        chmbr = cal_chamber_text.get(chamber)
        tables = url_xpath(cal_weekly_events,
                           "//table[@class='date-table']")
        for table in tables:
            date = table.xpath("../.")[0].getprevious().text_content()
            trs = table.xpath("./tr")
            for tr in trs:
                order = ["time", "chamber", "type", "agenda", "location",
                         "video"]

                tds = tr.xpath("./td")
                metainf = {}

                if not tds:
                    continue

                for key, td in zip(order, tds):
                    metainf[key] = td

                if chmbr and metainf['chamber'].text_content() != chmbr:
                    self.info("Skipping event based on chamber.")
                    continue

                time = metainf['time'].text_content()
                datetime_string = "%s %s" % \
                                  (date.strip(' \r\n'), time.strip(' \r\n'))
                location = metainf['location'].text_content()
                description = metainf['type'].text_content()
                dtfmt = "%A, %B %d, %Y %I:%M %p"
                dtfmt_no_time = "%A, %B %d, %Y"
                if time == 'Cancelled':
                    self.log("Skipping cancelled event.")
                    continue
                if "Immediately follows H-FLOOR" in datetime_string:
                    continue
                if ' Immediately follows' in datetime_string:
                    datetime_string, _ = datetime_string.split(
                        'Immediately follows')
                if "canceled" in datetime_string.lower():
                    continue
                if "TBA" in datetime_string:
                    continue

                datetime_string = datetime_string.strip()

                try:
                    when = dt.datetime.strptime(datetime_string, dtfmt)
                except ValueError:
                    when = dt.datetime.strptime(datetime_string, dtfmt_no_time)
                when = self._utc.localize(when)

                event = Event(
                    name=description,
                    start_date=when,
                    location_name=location,
                    description=description,
                )
                # The description is a committee name
                event.add_committee(name=description)
                event.add_source(cal_weekly_events)

                agenda = metainf['agenda'].xpath(".//a")
                if len(agenda) > 0:
                    for doc in agenda:
                        if not doc.text_content():
                            continue
                        agenda_url = doc.attrib['href']
                        self.add_agenda(
                            agenda_url, doc.text_content(), event)
                yield event
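
Pairing the column names with the cells through zip, as the loop above does, also means a short row simply yields fewer keys instead of raising IndexError; the mapping in isolation:

    order = ["time", "chamber", "type", "agenda", "location", "video"]
    tds = ["10:00 AM", "Senate", "Hearing", "", "Room 12"]  # short row
    metainf = dict(zip(order, tds))
    assert metainf["location"] == "Room 12"
    assert "video" not in metainf  # zip stops at the shorter sequence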
Example #15
    def scrape(self, start_time=None):

        if start_time is None:
            start_time = datetime.datetime(2017, 1, 1, 0, 0, tzinfo=pytz.utc)

        dupes = {}
        uniq = {}
        bad_ids = []

        for i, hearing in enumerate(self.congressional_hearings(start_time)):
            package_id = hearing['packageId']
            try:
                package_num, = re.findall(r'\d+$', package_id)
            except ValueError:
                bad_ids.append(package_id)
                continue
            # For appropriations hearings, the committees tend to
            # publish portions of the hearings as they are completed,
            # and then the final hearing are usually compiled,
            # printed, and added to the repository at the request of
            # the Committee.
            #
            # packages with 8 digits after hrg are the in-process
            # version
            #
            # There could be some time between the in-process and
            # final packages. Publication of hearings is the purview
            # of the committee.
            #
            # https://github.com/usgpo/api/issues/21#issuecomment-435926223
            if len(package_num) == 8:
                continue

            mods_link = hearing['download']['modsLink']
            response = self.get(mods_link)
            mods = xmltodict.parse(response.content)
            extension = collections.ChainMap(*mods['mods']['extension'])

            granule_class = extension.get('granuleClass')
            if granule_class == 'ERRATA':
                continue

            meeting_type = self._meeting_type(extension)
            if meeting_type is None:
                continue

            held_date = extension['heldDate']
            if isinstance(held_date, list):
                start_date = min(held_date)
            else:
                start_date = held_date

            event = Event(name=self._title(mods),
                          start_date=start_date,
                          classification=meeting_type,
                          location_name='unknown')
            if not event.name:
                continue

            if 'number' in extension:
                hearing_number = '{docClass} {congress}-{number}'.format(
                    **extension)
                print(hearing_number)
                event.extras['hearing_number'] = hearing_number

            for committee_d in self._unique(extension.get('congCommittee',
                                                          [])):
                names = committee_d['name']
                committee_name = self._name_type(names, 'authority-standard')
                if committee_name is None:
                    committee_name = self._name_type(names, 'authority-short')

                if committee_d['@chamber'] == 'H':
                    committee_name = 'House ' + committee_name
                elif committee_d['@chamber'] == 'S':
                    committee_name = 'Senate ' + committee_name

                try:
                    thomas_id = committee_d['@authorityId'].upper()
                except KeyError:
                    thomas_id = None

                sub_committees = self._subcommittees(committee_d)
                if sub_committees:
                    for sub_committee_d in sub_committees:
                        sub_committee_name = sub_committee_d['name']['#text']
                        sub_committee_name = sub_committee_name.strip(
                            string.punctuation)
                        sub_committee_id = _make_pseudo_id(
                            name=sub_committee_name,
                            parent__identifiers__identifier=thomas_id)
                        ret = {
                            "name": sub_committee_name,
                            "entity_type": 'organization',
                            "note": 'host',
                            "organization_id": sub_committee_id,
                        }
                        event.participants.append(ret)

                else:
                    if thomas_id:
                        ret = {
                            "name": committee_name,
                            "entity_type": 'organization',
                            "note": 'host',
                            "organization_id": _make_pseudo_id(
                                identifiers__identifier=thomas_id),
                        }
                        event.participants.append(ret)
                    else:
                        event.add_committee(committee_name, note='host')

            links = mods['mods']['location']['url']
            for link in self._unique(links):
                if link['@displayLabel'] == 'Content Detail':
                    event.add_source(link['#text'], note='web')
                elif link['@displayLabel'] == 'HTML rendition':
                    event.add_document('transcript',
                                       link['#text'],
                                       media_type='text/html')
                elif link['@displayLabel'] == 'PDF rendition':
                    event.add_document('transcript',
                                       link['#text'],
                                       media_type='application/pdf')

            event.add_source(mods_link, note='API')

            self._unique_event(uniq, event, dupes)

        self._house_docs(uniq)

        for event in uniq.values():
            yield event

        with open('bad_ids.txt', 'w') as f:
            for id in bad_ids:
                f.write(id + '\n')
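
collections.ChainMap(*mods['mods']['extension']) works because xmltodict returns the repeated extension elements as a list of dicts; the ChainMap searches them left to right, so extension.get(...) behaves like a lookup over the merged metadata. In miniature:

    import collections

    extension_dicts = [{"granuleClass": "HEARING"}, {"heldDate": "2019-05-01"}]
    extension = collections.ChainMap(*extension_dicts)
    assert extension.get("granuleClass") == "HEARING"
    assert extension.get("heldDate") == "2019-05-01"
    assert extension.get("number") is None  # missing in every mapping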
Example #16
    def _parse_house_floor_xml_legislative_activity(self, xml):
        """
        Parses XML string of House floor updates and yields them in loop.

        @param xml: XML of floor update
        @type xml: string
        @return: complete Event object
        @rtype: Event
        """
        tree = self._xml_parser(xml)

        congress = tree.xpath('.//legislative_congress')[0].get('congress')

        house_committees = self._get_current_house_committee_names()
        for fa in tree.xpath('.//floor_action'):
            fa_text = fa.xpath('.//action_description')[0].xpath('string()')

            eastern = pytz.timezone('US/Eastern')
            dt = datetime.datetime.strptime(fa.xpath('action_time')[0].get('for-search'), '%Y%m%dT%H:%M:%S')
            event = Event('House Floor Update on {0} at {1}.'.format(dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S')),
                          eastern.localize(dt).astimezone(pytz.utc),
                          'US/Eastern',
                          '',
                          description=fa_text,
                          classification='floor_update')

            event.set_location("East Capitol Street Northeast & First St SE, Washington, DC 20004",
                               note='House Floor', url='http://www.house.gov',
                               coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

            event.add_source(self._house_floor_src_url(date_str=tree.xpath('.//legislative_day')[0].get('date')),
                             note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

            event.extras['act-id'] = fa.get('act-id')
            event.extras['unique-id'] = fa.get('unique-id')

            # bills
            ai_b = event.add_agenda_item(description='Bills referenced by this update.')
            for bill in fa.xpath(".//a[@rel='bill']"):
                bill_name = bill.xpath('string()')
                ai_b.add_bill(bill_name, id=make_pseudo_id(identifier=bill_code_to_id(bill_name), congress=congress),
                              note="Bill was referenced on the House floor.")

            # publaws
            ai_p = event.add_agenda_item(description='Public laws referenced by this update.')
            for law in fa.xpath(".//a[@rel='publaw']"):
                detail_url = '/'.join(law.get('href').split('/')[0:-2]) + '/content-detail.html'
                ai_p.add_bill(law.xpath('string()'),
                              id=make_pseudo_id(**self._public_law_detail_scraper(url=detail_url)),
                              note='Law was referenced on the House floor.')

            # votes
            ai_v = event.add_agenda_item(description='Votes referenced by this update.')
            for vote in fa.xpath(".//a[@rel='vote']"):
                vote_name = vote.xpath('string()')
                ai_v.add_vote(vote_name,
                              id=make_pseudo_id(identifier=vote_code_to_id(vote_name), congress=congress),
                              note='Vote was referenced on the House floor.')

            # reports
            for report in fa.xpath(".//a[@rel='report']"):
                event.add_document('Document referenced by this update.', report.get('href'), media_type='text/html')

            for name in house_committees:
                if name.replace('House ', '') in fa_text:
                    event.add_committee(name, id=make_pseudo_id(name=name))

            # TODO: identify legislators and add them as participants?

            yield event
Example #17
    def scrape(self):
        last_events = deque(maxlen=10)
        for event, agenda in self.events(since=2017):
            other_orgs = ''
            extras = []

            if '--em--' in event[u'Meeting Location']:
                location_string, note = event[u'Meeting Location'].split(
                    '--em--')[:2]
                for each in note.split(' - '):
                    if each.startswith('Join'):
                        other_orgs = each
                    else:
                        extras.append(each)
            else:
                location_string = event[u'Meeting Location']

            location_list = location_string.split('-', 2)
            location = ', '.join([each.strip() for each in location_list[0:2]])
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            response = self.get(event['iCalendar']['url'], verify=False)
            event_time = self.ical(
                response.text).subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            time_string = event['Meeting Time']
            if time_string in ('Deferred', ):
                status = 'cancelled'
            elif self.now() < when:
                status = 'confirmed'
            else:
                status = 'passed'

            description = event['Meeting\xa0Topic']
            if any(each in description for each in ('Multiple meeting items',
                                                    'AGENDA TO BE ANNOUNCED')):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)

            if event_id in last_events:
                continue
            else:
                last_events.append(event_id)

            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)

            if extras:
                e.extras = {'location note': ' '.join(extras)}

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Minutes')

            if event['Name'] == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in event['Name'].lower():
                participating_orgs = [event["Name"]]
            else:
                participating_orgs = []

            if other_orgs:
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs += re.split(' and the |, the ', other_orgs)

            for org in participating_orgs:
                e.add_committee(name=org)

            if agenda:
                e.add_source(event["Meeting Details"]['url'], note='web')

                for item, _, _ in agenda:
                    if item["Name"]:
                        agenda_item = e.add_agenda_item(item["Name"])
                        if item["File\xa0#"]:
                            if item['Action']:
                                note = item['Action']
                            else:
                                note = 'consideration'
                            agenda_item.add_bill(item["File\xa0#"]['label'],
                                                 note=note)
            else:
                e.add_source(self.EVENTSPAGE, note='web')

            yield e
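
The deque(maxlen=10) makes the duplicate check a sliding window over the ten most recent (name, when) pairs rather than a full history; the eviction behavior on its own:

    from collections import deque

    last_events = deque(maxlen=3)
    for event_id in ["a", "b", "c", "d"]:
        if event_id not in last_events:
            last_events.append(event_id)
    assert list(last_events) == ["b", "c", "d"]  # "a" fell off when "d" arrived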
Example #18
    def scrape(self):
        last_events = deque(maxlen=10)
        for event, agenda in self.events(since=2011):
            other_orgs = ''
            extras = []

            if '--em--' in event[u'Meeting Location']:
                location_string, note = event[u'Meeting Location'].split('--em--')[:2]
                for each in note.split(' - '):
                    if each.startswith('Join'):
                        other_orgs = each
                    else:
                        extras.append(each)
            else:
                location_string = event[u'Meeting Location']

            location_list = location_string.split('-', 2)
            location = ', '.join([each.strip() for each in location_list[0:2]])
            if not location :
                continue

            when = self.toTime(event[u'Meeting Date'])

            event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour,
                                minute=event_time.minute)

            time_string = event['Meeting Time']
            if time_string in ('Deferred',):
                status = 'cancelled'
            elif self.now() < when:
                status = 'confirmed'
            else:
                status = 'passed'

            description = event['Meeting\xa0Topic']
            if any(each in description for each in ('Multiple meeting items',
                                                    'AGENDA TO BE ANNOUNCED')):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)

            if event_id in last_events:
                continue
            else:
                last_events.append(event_id)

            e = Event(name=event_name,
                      start_time=when,
                      timezone=self.TIMEZONE,
                      description=description,
                      location_name=location,
                      status=status)

            if extras:
                e.extras = {'location note': ' '.join(extras)}

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Minutes')

            if event['Name'] == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in event['Name'].lower():
                participating_orgs = [event["Name"]]
            else:
                participating_orgs = []

            if other_orgs:
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs += re.split(' and the |, the ', other_orgs)

            for org in participating_orgs:
                e.add_committee(name=org)

            if agenda:
                e.add_source(event["Meeting Details"]['url'])

                for item, _, _ in agenda:
                    if item["Name"]:
                        agenda_item = e.add_agenda_item(item["Name"])
                        if item["File\xa0#"]:
                            if item['Action']:
                                note = item['Action']
                            else:
                                note = 'consideration'
                            agenda_item.add_bill(item["File\xa0#"]['label'],
                                                 note=note)
            else:
                e.add_source(self.EVENTSPAGE)

            yield e
Example #19
    def scrape(self, window=3):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for api_event, event in self.events(n_days_ago):

            when = api_event['start']
            location = api_event['EventLocation']

            description = event['Meeting\xa0Topic']

            if any(each in description for each in ('Multiple meeting items',
                                                    'AGENDA TO BE ANNOUNCED')):
                description = None

            if description:
                e = Event(name=api_event["EventBodyName"],
                          start_date=when,
                          description=description,
                          location_name=location,
                          status=api_event['status'])
            else:
                e = Event(name=api_event["EventBodyName"],
                          start_date=when,
                          location_name=location,
                          status=api_event['status'])

            e.pupa_id = str(api_event['EventId'])

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Minutes')

            location_string = event[u'Meeting Location']
            location_notes, other_orgs = self._parse_location(location_string)

            if location_notes:
                e.extras = {'location note': ' '.join(location_notes)}

            if e.name == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in e.name.lower():
                participating_orgs = [e.name]
            else:
                participating_orgs = []

            if other_orgs:
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs += re.split(' and the |, the ', other_orgs)

            for org in participating_orgs:
                e.add_committee(name=org)

            for item in self.agenda(api_event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

            participants = set()

            for call in self.rollcalls(api_event):
                if call['RollCallValueName'] == 'Present':
                    participants.add(call['RollCallPersonName'].strip())

            for person in participants:
                e.add_participant(name=person, type="person")

            e.add_source(self.BASE_URL +
                         '/events/{EventId}'.format(**api_event),
                         note='api')

            try:
                detail_url = event['Meeting Details']['url']
            except TypeError:
                e.add_source(self.EVENTSPAGE, note='web')
            else:
                if requests.head(detail_url).status_code == 200:
                    e.add_source(detail_url, note='web')

            yield e
Example #20
    def scrape(self):
        for c in comm_base:
            print(c.xpath('.//h3/a/text()'))
        for c in comm_base:
            m = {}
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            print(c.xpath('.//h3/*'))
            title = c.xpath('.//h3/a/text()')
            if len(title) == 0:
                continue
            else:
                m['title'] = title[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            info_div = c.xpath('.//div[@class="calendar_p_indent"]')[0]
            print('one info div')
            if info_div is not None:
                info_list = info_div.xpath('.//text()')
                if info_list[0] == 'Room: ':
                    m['room'] = info_list[1]
                if info_list[1] == 'Chair: ':
                    chair = info_list[2]
                    if ',' in chair:
                        chairs = chair.replace('\xa0', '').split(',')
                        nchairs = []
                        for chair in chairs:
                            if chair.startswith('Rep.') or chair.startswith(
                                    'Sen.'):
                                cname = pull_middle_name(chair[4:])
                                nchairs.append(cname.strip())
                        m['chair'] = nchairs
                    elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                        cname = pull_middle_name(chair[4:].strip())
                        m['chair'] = [cname.strip()]
                if info_list[2] == 'Chair: ':
                    chair = info_list[3]
                    if ',' in chair:
                        chairs = chair.replace('\xa0', '').split(',')
                        nchairs = []
                        for chair in chairs:
                            if chair.startswith('Rep.') or chair.startswith(
                                    'Sen.'):
                                cname = pull_middle_name(chair[4:])
                                nchairs.append(cname.strip())
                        m['chair'] = nchairs
                    elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                        cname = pull_middle_name(chair[4:].strip())
                        m['chair'] = [cname.strip()]
                if len(info_list) > 5 and info_list[4] == 'Agenda: ':
                    m['agenda'] = info_list[5]

            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            ppr(m)
            date = c.xpath('.//p/b/text()')
            if len(date) < 1:
                print('\n\n\n\n NO DATE')
                ppr(m)
                continue
            m['date'] = datetime.datetime.strptime(date[0], format1)

            # Not every listing includes a room or chair; default them so the
            # lookups below cannot raise KeyError.
            m.setdefault('room', 'N/A')
            m.setdefault('chair', [])

            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m['room'])

            event.add_committee(m['title'])
            event.add_source(m['link'])
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")
            yield event
Example #21
    def scrape(self):
        for c in senate_base:
            m = {}
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            link = c.xpath('.//h3/a/@href')
            print('top link: ', c.xpath('.//h3/*'))
            if len(link) > 0:
                m['link'] = c.xpath('.//h3/a/@href')[0]
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
                m['title'] = c.xpath('.//h3/text()')[0]
            print('top link 2: ', c.xpath('.//h3/text()'))
            info_div = c.xpath('.//div[@class="calendar_p_indent"]')
            if len(info_div) > 0:
                info_div = info_div[0]
                info_list = info_div.xpath('.//text()')
                nchairs = []
                agenda = False
                for il in info_list:
                    il = il.replace('\xa0', '')
                    if il.startswith(' and '):
                        il = il.replace(' and ', '')
                    if il.startswith('Room'):
                        m['room'] = il
                    if il.startswith('Rep.') or il.startswith('Sen.'):
                        cname = pull_middle_name(il[4:])
                        nchairs.append(cname.strip())
                    if agenda:
                        m['agenda'] = il
                    if il == 'Agenda: ':
                        agenda = True
                m['chair'] = nchairs
            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            ppr(m)
            date = c.xpath('.//p/span/text()')
            if len(date) < 1:
                print('\n\n\n\n NO DATE')
                ppr(m)
                continue
            if 'or' in date[0]:
                date[0] = date[0].split('or')[0]
            m['date'] = datetime.datetime.strptime(date[0].replace('\xa0', ''),
                                                   format1)
            ppr(m)
            if 'room' not in m:
                m['room'] = 'Senate in session'
            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m['room'])

            event.add_committee(m['title'])
            event.add_source(m['link'])
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")
            yield event
Example #22
    def scrape(self):
        for c in house_base:
            m = {}
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            links = c.xpath('.//h3/a/@href')
            if len(links) > 0:
                m['cmt'] = c.xpath('.//h3/a/text()')[0]
                m['link'] = c.xpath('.//h3/a/@href')[0]
                title = c.xpath('.//h3/text()')[0]
                if title == 'Agenda:':
                    m['title'] = c.xpath('.//h3/a/text()')[0]
                else:
                    m['title'] = c.xpath('.//h3/text()')[0]
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
                m['link'] = None
            info_div = c.xpath('.//*[@class="calendar_p_indent"]')
            if len(info_div) == 0:
                pass
            else:
                info_div = info_div[0]
            print('Info Div: ', info_div)
            if len(info_div) > 0:
                info_list = info_div.xpath('.//text()')
                info_links = info_div.xpath('.//*/@href')
                print("info links: ", info_links)
                info_list = [x.replace('\n', '').strip() for x in info_list]
                info_list = [x for x in info_list if len(x) > 0]
                print('Info list: ', info_list)
                if info_list[0].startswith('Room:'):
                    m['room'] = info_list[1]
                else:
                    m['room'] = 'n/a'
                if len(info_list) > 2:
                    if info_list[2].startswith('Chair:'):
                        chair = info_list[3]
                        if ',' in chair:
                            chairs = chair.replace('\xa0', '').split(',')
                            nchairs = []
                            for chair in chairs:
                                if chair.startswith('Rep.') or chair.startswith('Sen.'):
                                    cname = pull_middle_name(chair[4:])
                                    nchairs.append(cname.strip())
                            m['chair'] = nchairs
                        elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:].strip())
                            m['chair'] = [cname.strip()]
                else:
                    m['chair'] = None
            
            bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
            print('Bills: ', bill_rows)
            bills = []
            for brs in bill_rows:
                cells = brs.xpath('.//td')
                if len(cells) == 3:
                    b = {}
                    b['bill'] = cells[0].xpath('.//text()')[0]
                    b['author'] = cells[1].xpath('./text()')[0]
                    b['summary'] = cells[2].xpath('./text()')[0]
                    bills.append(b)
            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            date = c.xpath('.//p/b/text()')
            if len(date) < 1:
                print('\n\n\n\n NO DATE')
                continue
            m['date'] = datetime.datetime.strptime(date[0], format1)

            if 'House Meets in Session' in m['title']:
                m['room'] = 'State leg'
                m['cmt'] = 'Minnesota House of Representatives'
                m['chair'] = None
                m['link'] = 'https://www.leg.state.mn.us/cal?type=all'

            # Default any fields the listing omitted so the lookups below
            # cannot raise KeyError.
            m.setdefault('room', 'n/a')
            m.setdefault('cmt', m['title'])
            m.setdefault('chair', None)

            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m['room'])
            if len(bills) > 0:
                for bill in bills:
                    nbill = event.add_agenda_item(description=bill['summary'])
                    nbill.add_bill(bill['bill'].replace('HF', 'HF '))
            event.add_committee(m['cmt'])
            if m['link'] is not None:
                event.add_source(m['link'])
            if m['chair'] is not None:
                for chair in m['chair']:
                    event.add_person(name=chair, note="Chair")
            yield event