Пример #1
0
    def get_events(self):
        """Yield Event objects scraped from the Boston meeting-results page.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"

        page = self.lxmlize(url)
        for entry in page.xpath(
                "//tr[@style='font-family: Verdana; font-size: 12px;']"):
            name, when, links = entry.xpath(".//td")
            # Strip mojibake non-breaking spaces (UTF-8 NBSP read as latin-1).
            name = name.text.strip().replace(u"\xc2\xa0", "")
            when = when.text.strip().replace(u"\xc2\xa0", "")
            when = dt.datetime.strptime(when, "%m/%d/%Y")
            links = links.xpath(".//a")
            links = {x.text: x.attrib['href'] for x in links}
            e = Event(name=name,
                      session=self.session,
                      when=when,
                      location='unknown')

            e.add_source(url)
            # BUG FIX: the loop variable was previously named ``url``,
            # clobbering the page URL, so add_source() recorded the last
            # link's href instead of the page for every entry after the
            # first.
            for note, link_url in links.items():
                e.add_link(note=note, url=link_url)

            yield e
Пример #2
0
    def get_events(self):
        """Yield Event objects parsed from the calendar list at PAGE.

        The list interleaves date headers (an <h2> inside an <a>) with
        meeting entries; the most recent header's text is carried in
        ``when`` and applied to every following entry until the next
        header appears.
        """
        page = self.lxmlize(PAGE)
        events = page.xpath("//div[@class='col-middle']//ul/li")
        # ``when`` holds the text of the last date header seen; entries
        # before the first header cannot be dated and are skipped.
        when = None
        for event in events:
            h3 = event.xpath("./a/h2")
            h3 = h3[0] if h3 else None
            if h3 is not None:
                # This <li> is a date header, not a meeting entry.
                when = h3.text
            else:
                if when is None:
                    self.warning("Ungrok!")
                    continue

                # Entry layout: <p><b>title</b> ... <i>details</i></p>
                b, _, i = event.xpath("./p/*")
                title = b.text_content()
                event = i.text_content()

                if "NO MEETING" in event:
                    continue

                # Title looks like "MM/DD/YYYY - Committee Name".
                day, title = (x.strip() for x in title.split("-", 1))

                where = "Council Chambers"

                # One <li> may describe several meetings separated by ";".
                for subevent in (x.strip() for x in event.split(";")):
                    # A trailing " in <location>" overrides the default
                    # room, and sticks for subsequent subevents.
                    if " in " in subevent:
                        subevent, where = subevent.rsplit(" in ", 1)
                    subevent = subevent.replace(u'\xa0', ' ')

                    if "NO" in subevent and "MEETING" in subevent:
                        continue

                    if "to follow" in subevent:
                        continue

                    info = EVENT_RE.match(subevent).groupdict()
                    event, time = [info[x] for x in ['event', 'time']]

                    # strptime understands "AM"/"PM", not "a.m."/"p.m.".
                    ampm = {
                        "a.m.": "AM",
                        "p.m.": "PM",
                    }

                    for old, new in ampm.items():
                        time = time.replace(old, new)

                    dtstring = ", ".join([day, time])

                    # Times appear both as "10:30 AM" and as "10AM".
                    try:
                        etime = dt.datetime.strptime(dtstring,
                                                     "%m/%d/%Y, %I:%M %p")
                    except ValueError:
                        etime = dt.datetime.strptime(dtstring,
                                                     "%m/%d/%Y, %I%p")

                    e = Event(name=event, when=etime, location=where)
                    e.add_source(PAGE)
                    yield e
Пример #3
0
    def get_events(self):
        """Yield Event objects from the NYC Legistar calendar table,
        attaching details/agenda/minutes documents where present.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://legistar.council.nyc.gov/Calendar.aspx"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]  # skip the header row
        for row in rows:
            els = row.xpath(".//td")
            if len(els) <= 2:
                continue  # Odd one-off.

            (name, date, _, time, where, topic, details, agenda, minutes,
             media) = els
            # _ nom's the image of the cal next to the meeting date.

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()
            topic = topic.text_content().strip()

            if "Deferred" in time:
                continue

            # An empty time cell means an all-day event: parse the date
            # alone.  (A never-used ``all_day`` flag was dropped here.)
            date_str = date.text.strip()
            if time == "":
                when = dt.datetime.strptime(date_str, "%m/%d/%Y")
            else:
                when = dt.datetime.strptime(
                    "%s %s" % (date_str, time), "%m/%d/%Y %I:%M %p")

            event = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)
            event.add_source(url)

            details = details.xpath(".//a[@href]")
            for detail in details:
                event.add_document(detail.text,
                                   detail.attrib['href'],
                                   mimetype='text/html')

            agendas = agenda.xpath(".//a[@href]")
            for a in agendas:
                event.add_document(a.text,
                                   a.attrib['href'],
                                   mimetype='application/pdf')

            minutes = minutes.xpath(".//a[@href]")
            for minute in minutes:
                event.add_document(minute.text,
                                   minute.attrib['href'],
                                   mimetype='application/pdf')

            yield event
Пример #4
0
    def scrape_event_page(self, event):
        """Scrape a single event detail page and yield an Event.

        ``event`` is an <a> element whose href points at the detail page.
        Returns without yielding when the page has no title, the meeting
        is canceled, or the entry describes a recurring series.
        """
        url = event.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath("//h2[@class='evlist_header']")
        title = title[0].text.strip() if title else None
        if title is None:
            return
        if "CANCELED" in title:
            return

        info = page.xpath(
            "//div[@style='position:relative;margin-right:40px;']")[0]
        blocks = info.xpath(".//div")
        ret = {}
        # Build a label -> value map out of <label>/<div> sibling pairs.
        for block in blocks:
            els = block.xpath("./*")
            if not els:
                continue
            le = els[0]

            if le.tag != 'label':
                continue

            label, div = els

            ltex = label.text_content().strip()
            dtex = div.text_content().strip()
            ret[ltex] = dtex

        # "When:" looks like "<date>\n@ <start>\n- <end>".
        # (A dead ``when = dt.datetime.utcnow()`` assignment was removed;
        # its value was never read.)
        date, start, end = (x.strip() for x in ret['When:'].split("\n"))
        start = re.sub(r"^@", "", start).strip()
        end = end.replace("-", "").strip()

        # strptime needs the full month name; the site abbreviates April.
        replace = [
            ('Apr', 'April'),
        ]

        # Markers of recurring events we cannot represent as one Event.
        skip = ["Occurs every"]

        for k, v in replace:
            date = date.replace(k, v).strip()

        if any(x in end for x in skip):
            return

        start = "%s %s" % (date, start)
        end = "%s %s" % (date, end)
        start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                      for x in (start, end))

        event = Event(session=self.session,
                      name=title,
                      location=ret['Where:'],
                      when=start,
                      end=end)
        event.add_source(url)
        yield event
Пример #5
0
    def get_events(self):
        """Yield Events parsed from the calendar page at CAL_PAGE.

        The page is a flat run of headings and paragraphs: a heading
        (h1..h6) sets the current date, and each following <p> with an
        Archive.aspx agenda link describes one meeting on that date.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        curdate = None
        page = self.lxmlize(CAL_PAGE)
        for el in page.xpath("//div[@id='Section1']/*"):
            # Any heading tag carries the date for the entries below it.
            if el.tag[0] == 'h':
                when = WHEN.findall(el.text_content())
                when = when[0] if when else None
                if when is None:
                    continue
                curdate = " ".join(when)

            if (el.tag == 'p'): # and el.attrib.get('class') == 'MsoNormal'):

                agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
                agenda = agenda[0] if agenda else None
                if agenda is None:
                    continue

                info = self.cleanup(el.text_content())
                when = DT.findall(info)
                when = when[0] if when else None
                if when is None:
                    continue

                # (Unused ``els``/``people``/``places`` lookups removed.)
                time, ampm = when

                if curdate is None:
                    self.warning("Can't scrape, since I don't know what date it is")
                    continue

                tbuf = " ".join([curdate, time, ampm])
                obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

                # The location follows the last en-dash in the entry text.
                try:
                    _, where = info.rsplit(u"–", 1)
                except ValueError:
                    continue

                # Normalize odd whitespace, collapse runs, and drop a
                # trailing "agenda" label.  Regexes are now raw strings
                # (the old "\s+" literal relied on a deprecated escape).
                where = where.replace(u" ", " ")
                where = re.sub(r"\s+", " ", where).strip()
                where = re.sub(r"agenda$", "", where).strip()

                event = Event(name=info,
                              session=self.session,
                              when=obj,
                              location=where)
                event.add_source(CAL_PAGE)
                yield event
Пример #6
0
    def get_events(self):
        """Yield committee Events for a window around today, with related
        ordinances attached as agenda items.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        # Window: 10 days back, then 30 days forward from that point.
        start = dt.datetime.utcnow()
        start = start - dt.timedelta(days=10)
        end = start + dt.timedelta(days=30)

        url = URL.format(**{
            "from": start.strftime("%Y/%m/%d"),
            "til": end.strftime("%Y/%m/%d")
        })

        page = self.lxmlize(url)
        events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

        for event in events:
            # (Unused ``string = event.text_content()`` removed.)
            po = CLICK_INFO.match(event.xpath(".//span")[0].attrib['onclick'])
            if po is None:
                continue

            poid = po.groupdict()[
                'info_id']  # This is used to get more deetz on

            popage = self.popOverUrl(poid)
            when = dt.datetime.strptime(
                popage.xpath("//strong")[0].text, "%B %d, %Y @ %I:%M %p")
            who = popage.xpath("//h1")[0].text
            related = []

            # Collect ordinance references mentioned in the popover body.
            for item in popage.xpath("//div"):
                t = item.text
                if t is None:
                    continue

                t = t.strip()
                for related_entity in ORD_INFO.findall(t):
                    related.append({"ord_no": related_entity, "what": t})

            e = Event(name=who,
                      session=self.session,
                      when=when,
                      location='unknown')
            e.add_source(url)

            for o in related:
                i = e.add_agenda_item(o['what'])
                i.add_bill(o['ord_no'], note='consideration')

            yield e
Пример #7
0
    def get_events(self):
        """Yield Events from the Chicago Legistar calendar table,
        attaching agenda and summary links where present.

        Raises:
            Exception: if ``self.session`` is not the current session.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        url = "http://chicago.legistar.com/Calendar.aspx/"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        rows = main.xpath(".//tr")[1:]  # skip the header row
        for row in rows:
            if "No records were found." in row.text_content():
                self.warning("Hum. They don't seem to have events?")
                continue

            (name, date, _, time, where, details, notice,
             agenda, summary, video) = row.xpath(".//td")
            # _ nom's the image next to the date on the page.

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()

            if "Deferred" in time:
                continue

            # An empty time cell means an all-day event: parse the date
            # alone.  (A never-used ``all_day`` flag was dropped here.)
            date_str = date.text.strip()
            if time == "":
                when = dt.datetime.strptime(date_str, "%m/%d/%Y")
            else:
                when = dt.datetime.strptime("%s %s" % (date_str, time),
                                            "%m/%d/%Y %I:%M %p")

            event = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)
            event.add_source(url)

            agendas = agenda.xpath(".//a[@href]")
            for a in agendas:
                event.add_link(a.text, a.attrib['href'])

            summary = summary.xpath(".//a[@href]")
            for minute in summary:
                event.add_link(minute.text, minute.attrib['href'])

            yield event
Пример #8
0
    def scrape_event(self, href):
        """Scrape one event detail page (``href`` is an <a> element) and
        yield an Event built from its Date/Time/Location fields."""
        page = self.lxmlize(href.attrib['href'])
        what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text
        info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:]
        ret = {
            "Location:": "Unknown"
        }
        for tr in info:
            tds = tr.xpath(".//td")
            if len(tds) < 2:
                continue
            # BUG FIX: this used to unpack into ``what``, clobbering the
            # event title read from the title cell above — every Event
            # ended up named after the last table label instead.
            key, data = [tds.pop(0).text_content().strip() for _ in range(2)]
            ret[key] = data

        agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]")
        if agendas:
            for agenda in agendas:
                print("Agenda:", agenda.attrib['href'])

        # "Time:" may be "start - end" or just a start time.
        t = ret['Time:']
        start_time, end_time = t, None
        if "-" in t:
            start_time, end_time = (x.strip() for x in t.split("-", 1))

        start_time = "%s %s" % (ret['Date:'], start_time)
        dts = "%B %d, %Y %I:%M %p"
        start = dt.datetime.strptime(start_time, dts)

        end = None
        if end_time:
            end = "%s %s" % (ret['Date:'], end_time)
            end = dt.datetime.strptime(end, dts)

        # Only pass ``end`` when the page actually listed one.
        kwargs = {}
        if end:
            kwargs['end'] = end

        e = Event(name=what, session=self.session, location=ret['Location:'], when=start,
                  **kwargs)
        e.add_source(href.attrib['href'])
        yield e
Пример #9
0
    def scrape_event_page(self, page):
        """Yield Events from a listview results ``page`` (lxml document).

        Each row's cells are key/value pairs; the second text of the first
        usable cell is taken as the row's title, and a "Time: ..." key is
        split into a {start, end} dict under the "time" key.
        """
        for entry in page.xpath(
                "//table[@id='Listview1_DataGrid1']//tr[@class='mainText']"):
            # ``title`` doubles as a "first cell seen yet?" sentinel.
            title = None
            ret = {}
            for block in entry.xpath(".//td[@class='mainText']"):
                entries = block.xpath("./*")
                # Cells containing nested tables are layout, not data.
                if "table" in (x.tag for x in entries):
                    continue
                info = [self.cleanup(x.text_content()) for x in entries]
                # First data cell: its second text is the entry title.
                if title is None:
                    title = info[1]
                    continue
                # Remaining cells are [key] or [key, value] pairs;
                # pop() consumes ``info`` so the leftover check below
                # can detect unexpected extra texts.
                key = info.pop(0)
                val = None
                if "Time: " in key:
                    # "Time: <start> - <end>" is folded into a dict.
                    _, val = key.split("Time: ", 1)
                    start, end = val.split(" - ", 1)
                    val = {"start": start, "end": end}
                    key = "time"
                else:
                    val = info.pop(0) if info else None

                ret[key] = val
                if info != []:
                    # More than two texts in one cell: layout changed.
                    raise Exception("Erm. odd scrape.")

            if title is None:
                continue

            ret['title'] = title
            # Normalize the raw start/end strings into datetimes.
            start, end = self.get_start_end(ret)
            ret['time']['start'], ret['time']['end'] = start, end

            event = Event(name=ret['Description:'] or "TBA",
                          session=self.session,
                          location=ret['Location:'],
                          when=ret['time']['start'],
                          end=ret['time']['end'])
            yield event
Пример #10
0
    def get_events(self):
        """Yield Events from the Legistar events pages.

        The meeting-location cell packs "location -- extra -- status"
        into one string; the trailing segment after "Chicago, Illinois"
        carries the status when present.
        """
        for page in self.eventPages(EVENTSPAGE):
            events_table = page.xpath("//table[@class='rgMasterTable']")[0]
            for events, headers, rows in self.parseDataTable(events_table):
                print(events)
                location_string = events[u'Meeting\xa0Location']
                location_list = location_string.split('--')
                location = ', '.join(location_list[0:2])

                status_string = location_list[-1].split('Chicago, Illinois')
                if len(status_string) > 1 and status_string[1]:
                    status = status_string[1].lower()
                    # Unknown status strings fall back to confirmed.
                    if status not in ['cancelled', 'tentative', 'confirmed', 'passed']:
                        print(status)
                        status = 'confirmed'
                else:
                    status = 'confirmed'

                # NOTE(review): events[u'Meeting\xa0Date'] appears to be a
                # datetime already (it supports .replace) — confirm the
                # upstream parser.
                when = events[u'Meeting\xa0Date']
                time_string = events[u'Meeting\xa0Time']
                event_time = datetime.datetime.strptime(time_string,
                                                        "%I:%M %p")
                # BUG FIX: the minute component was previously discarded,
                # so e.g. a 10:30 AM meeting was recorded as 10:00 AM.
                when = when.replace(hour=event_time.hour,
                                    minute=event_time.minute)

                e = Event(name=events["Name"]["label"],
                          session=self.session,
                          when=when,
                          location=location,
                          status=status)
                e.add_source(EVENTSPAGE)
                if events['Video'] != u'Not\xa0available':
                    print(events['Video'])

                yield e
Пример #11
0
    def handle_buffer(self, buf):
        """Parse a text buffer into a single Event at City Hall.

        Returns without yielding when no date can be found; the event
        name is whatever text follows the year, cleaned of dash
        separators.
        """
        found = DATE_FINDER.findall(buf)
        if not found:
            return
        month, day, year = found[0]
        # Everything after the year is the event description.
        _, buf = buf.split(year, 1)

        clocks = TIME_FINDER.findall(buf)
        clock = clocks[0] if clocks else None

        stamp = "%s %s %s" % (month, day, year)
        fmt = "%B %d %Y"
        if clock is not None:
            # A time was found: append it and extend the parse format.
            stamp += " %s" % (clock)
            fmt += " %I:%M %p"

        # strptime has no directive for "Noon"; treat it as PM.
        stamp = stamp.replace("Noon", "PM")

        # Clean the description: en-dash -> hyphen, then strip any
        # leading " - " separator and surrounding whitespace.
        buf = re.sub("–", "-", buf)
        buf = re.sub(r"^\s+\-\s+", "", buf)
        buf = buf.strip()

        parsed = dt.datetime.strptime(stamp, fmt)
        yield Event(name=buf,
                    session=self.session,
                    when=parsed,
                    location="City Hall")
Пример #12
0
    def get_events(self):
        """Yield Arlington meeting Events from both the archive and
        upcoming tables, attaching agenda, video, audio, and minutes
        links where present."""
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    # BUG FIX: was a bare ``except:``, which also swallowed
                    # SystemExit/KeyboardInterrupt.
                    continue

                meeting_title = cell_mapping['title'].text
                # The date cell holds a unix timestamp inside a <span>.
                meeting_date = datetime.datetime.fromtimestamp(int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, session=self.session, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                if cell_mapping['agenda'].cssselect('a'):
                    meeting_agenda_url = cell_mapping['agenda'].cssselect('a')[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

                # skip everything below here for the 'upcoming' table
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    video_url_match = re.search(r"http://(.*?)'", video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url=video_url_match.group(0), mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e
Пример #13
0
    def get_events(self):
        """Yield Toronto council meeting Events with per-member attendance
        and agenda items.

        Uses two TMMIS admin reports:
          - getAdminReport.do?function=prepareMemberAttendanceReport
            (per-member CSVs, written to a temp dir)
          - getAdminReport.do?function=prepareMeetingScheduleReport
            (the meeting list itself, written to meetings.csv)
        """
        # --- scrape attendance -------------------------------------------
        tmpdir = tempfile.mkdtemp()

        page = lxmlize(
            "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport"
        )
        members = page.xpath(
            '//td[@class="inputText"]/select[@name="memberId"]/option')
        for member in members:
            post = {
                'function': 'getMemberAttendanceReport',
                'download': 'csv',
                'exportPublishReportId': 1,
                'termId': 4,
                'memberId': member.attrib['value'],
                'decisionBodyId': 0,
            }
            r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                              data=post)
            # A non-Excel payload means no attendance data for this member.
            if r.headers['content-type'] != 'application/vnd.ms-excel':
                continue

            # BUG FIX: use a context manager so the file is closed even if
            # the write raises (was a manual open/write/close).
            with open(tmpdir + '/' + member.text + '.csv', 'w') as attendance_file:
                attendance_file.write(r.text)

        # --- scrape events -----------------------------------------------
        post = {
            'function': 'getMeetingScheduleReport',
            'download': 'csv',
            'exportPublishReportId': 3,
            'termId': 4,
            'decisionBodyId': 0,
        }

        r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                          data=post)
        empty = []  # rows with no recorded attendees, kept for debugging

        with open('meetings.csv', 'w') as meeting_file:
            meeting_file.write(r.text)
        # NOTE(review): 'rb' + csv.reader is a Python 2 idiom; Python 3
        # needs text mode here — confirm the target runtime.
        with open('meetings.csv', 'rb') as csvfile:
            csvfile = csv.reader(csvfile, delimiter=',')
            next(csvfile)  # skip the header row

            committee = ''
            agenda_items = []

            for row in csvfile:
                name = row[0]
                when = row[2]
                when = dt.datetime.strptime(when, "%Y-%m-%d")
                location = row[5]

                # Rows are grouped by committee; fetch agenda items only
                # when the committee changes.
                if name != committee:
                    committee = name
                    agenda_items = find_items(committee)

                e = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)

                # BUG FIX (perf): find_attendees() was previously called a
                # second time in the loop below; reuse the result.
                attendees = find_attendees(tmpdir, row)
                if len(attendees) == 0:
                    empty.append(row)
                for attendee in attendees:
                    e.add_person(attendee)
                e.add_source(
                    "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport"
                )

                for item in agenda_items:
                    if item['date'].date() == when.date():
                        i = e.add_agenda_item(item['description'])
                        i.add_committee(committee)
                        i['order'] = item['order']

                        for link in item['links']:
                            i.add_media_link(link['name'],
                                             link['url'],
                                             on_duplicate='ignore')

                        if 'notes' in item:
                            i['notes'] = [item['notes']]

                yield e

        shutil.rmtree(tmpdir)
        os.remove('meetings.csv')