예제 #1
0
    def get_events(self):
        """Yield one Event per executive order linked from the NY index page.

        Fetches the executive-order index, reads the governor's name from the
        first matching section header, then follows every non-PDF link found
        in the content paragraphs. The text of each linked page is searched
        for the closing "G I V E N ... by the governor" passage, from which
        the signing date is parsed.

        Yields:
            Event: built from the paragraph text preceding the first ':',
            the parsed date and 'New York'; the governor is attached via
            ``add_person`` and the order's page via ``add_document`` and
            ``add_source``.

        Raises:
            IndexError: if the governor header is not found on the index.
            AttributeError: if an order page lacks the expected date passage
                (the regex match is ``None``).
        """
        # get list of executive orders
        url = 'http://www.governor.ny.gov/sl2/ExecutiveOrderindex'
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        # extract governor's name
        gov = page.xpath("(//div[@class='section-header']/div/div/div/a/div/h2)[1]")[0]
        # Drop the literal title prefix. str.lstrip() takes a *set of
        # characters*, so lstrip('Governor ') would also eat leading letters
        # of the name itself ("Governor George ..." -> "ge ...").
        governor_name = gov.text.strip()
        title = 'Governor'
        if governor_name.startswith(title):
            governor_name = governor_name[len(title):].lstrip()

        # scrape each executive order
        for eo_par in page.xpath("//div[@class='content']/p"):
            # The order's label is the paragraph text before the first ':';
            # it is the same for every link inside this paragraph.
            eo_number = eo_par.xpath('string()').split(':', 1)[0]

            for link in eo_par.xpath(".//a"):
                eo_url = link.get('href')
                if eo_url is None:
                    # anchor without an href: nothing to fetch
                    continue
                # Lower-case only for the extension test; the URL itself keeps
                # its original case because URL paths are case-sensitive.
                if eo_url.lower().endswith('.pdf'):
                    continue

                # get date for executive order
                eo_tree = lxml.html.fromstring(self.urlopen(eo_url))
                # Lower-case the page text and turn every newline / non-word
                # character into a space, then collapse runs of whitespace, so
                # the patterns below match plain space-separated words.
                eo_text = re.sub(r'(\r*\n|\W)', ' ', eo_tree.xpath('string()').lower())
                eo_text = re.sub(r'\s+', ' ', eo_text)
                date_par = re.search('(?:g i v e n)(.*)(?:by the governor)', eo_text).groups()[0]
                day_words, month_name, year_words = [
                    part.strip() for part in
                    re.match('(?:.*this)(.*)(?:day of)(.*)(?:in the year)(.*)', date_par).groups()]
                # NOTE(review): Wtn.parse presumably converts spelled-out
                # numbers to integers -- confirm against the Wtn module.
                eo_date = dt.datetime.strptime(
                    ' '.join((str(Wtn.parse(day_words)), month_name, str(Wtn.parse(year_words)))),
                    '%d %B %Y')

                # build yield object
                eo = Event(eo_number, eo_date, 'New York')
                eo.add_person(governor_name, 'governor')
                eo.description = link.text
                eo.add_document(eo_number, eo_url, 'text/html')
                eo.add_source(eo_url)

                yield eo

        # TODO: get list of press statements
예제 #2
0
    def get_events(self):
        """Yield one Event per executive order of the governor in ``self.session``.

        Walks the rows of every table with a positive ``border`` attribute on
        the NJ executive-order index. A single-cell row is read as a governor
        heading and updates the current governor. A three-cell row is read as
        an order (number, title, date) and is emitted only while the current
        governor's session name equals ``self.session``.

        Rows that are not well-formed orders are skipped: non-numeric order
        number, empty title, missing or invalid date, or no link in the
        first cell.

        Yields:
            Event: built from the order number, the parsed date,
            'New Jersey' and the governor's session name; the governor is
            attached via ``add_person`` and the linked document via
            ``add_document`` and ``add_source``.
        """
        # get list of executive orders
        url = 'http://nj.gov/infobank/circular/eoindex.htm'
        page = lxml_html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        # state variables for parser
        governor_name = None
        gov_session_name = None

        # one guesser for the whole table instead of one per row
        mime_types = MimeTypes()

        # parse the table of executive orders
        for eo_row in page.xpath('//table[@border>0]//tr'):

            cols = eo_row.xpath('.//td')

            # extract governor's name
            if len(cols) == 1:
                # remove things like "'s"
                heading = re.sub(r'\W\w\s', ' ', eo_row.xpath('string()'))
                heading = re.sub(r'\r*\n|\W', ' ', heading)
                heading = re.sub(r'\s+', ' ', heading)
                found = re.search("executive order.*governor(.*)administration",
                                  heading, re.IGNORECASE)
                if found is None:
                    # single-cell row that is not a governor heading:
                    # keep the current governor instead of crashing
                    continue
                governor_name = found.group(1).strip()
                gov_session_name = re.sub(r'\s+', '_', governor_name)
                continue

            # extract executive order
            if len(cols) != 3 or self.session != gov_session_name:
                continue

            eo_num = cols[0].xpath('string()').strip()
            try:
                float(eo_num)
            except ValueError:
                # header or filler row: the first cell is not a number
                continue

            eo_title = re.sub(r'\r*\n', ' ', cols[1].xpath('string()'))
            eo_title = re.sub(r'\s+', ' ', eo_title)
            eo_title = re.sub(r'\[.*pdf.*\]', '', eo_title).strip()
            if not eo_title:
                continue

            date_found = re.search('([0-9]{1,2}).*/([0-9]{1,2}).*/([0-9]{4}|[0-9]{2})',
                                   cols[2].xpath('string()'))
            if date_found is None:
                continue
            month, day, year = date_found.groups()
            # the pattern captures either a 4-digit or a 2-digit year
            date_format = '%m/%d/%Y' if len(year) == 4 else '%m/%d/%y'
            try:
                eo_date = dt.datetime.strptime('/'.join((month, day, year)), date_format)
            except ValueError:
                # digits matched but do not form a real calendar date
                continue

            anchors = cols[0].xpath('.//a')
            eo_source = anchors[0].get('href') if anchors else None
            if not eo_source:
                # no linked document for this order
                continue
            # The URL keeps its original case because URL paths are
            # case-sensitive.
            mime_type = mime_types.guess_type(eo_source)[0]
            if mime_type is None:
                mime_type = 'text/html'

            # build yield object
            eo = Event(eo_num, eo_date, 'New Jersey', gov_session_name)
            eo.add_person(governor_name, 'governor')
            eo.description = eo_title
            eo.add_document(eo_num, eo_source, mime_type)
            eo.add_source(eo_source)

            yield eo